Statistics
| Branch: | Revision:

root / target-arm / neon_helper.c @ 9c486ad6

History | View | Annotate | Download (34.6 kB)

1
/*
2
 * ARM NEON vector operations.
3
 *
4
 * Copyright (c) 2007, 2008 CodeSourcery.
5
 * Written by Paul Brook
6
 *
7
 * This code is licenced under the GNU GPL v2.
8
 */
9
#include <stdlib.h>
10
#include <stdio.h>
11

    
12
#include "cpu.h"
13
#include "exec-all.h"
14
#include "helpers.h"
15

    
16
/* Most significant bit of a 32-bit and of a 64-bit word.  Both expansions
   are fully parenthesized so they compose safely with any operator.  */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)
18

    
19
/* Raise the sticky saturation flag: OR the CPSR_Q bit into the FPSCR slot
   of the VFP register file.  The other FPSCR bits must be preserved, so
   this has to be a read-modify-write rather than a plain store.  Expects a
   variable named `env' to be in scope at the point of use.  */
#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
20

    
21
/* File-local floating point status object, and a shorthand for its
   address.  The shorthand is parenthesized so it stays a single operand
   wherever it is expanded.  */
static float_status neon_float_status;
#define NFS (&neon_float_status)
23

    
24
/* Helper routines to perform bitwise copies between float and int.  */
25
static inline float32 vfp_itos(uint32_t i)
{
    /* Reinterpret the 32 raw bits as a float32; no numeric conversion.  */
    union {
        uint32_t bits;
        float32 fp;
    } pun;

    pun.bits = i;
    return pun.fp;
}
35

    
36
static inline uint32_t vfp_stoi(float32 s)
{
    /* Reinterpret a float32 as its 32 raw bits; no numeric conversion.  */
    union {
        uint32_t bits;
        float32 fp;
    } pun;

    pun.fp = s;
    return pun.bits;
}
46

    
47
/* Generators for the "vector packed in one word" structure types.
   NEON_TYPEn(name, type) declares neon_<name> holding n lanes of `type',
   called v1..vn.  On big-endian hosts the fields are declared in reverse
   order, so that v1 overlays the same end of an aliased uint32_t on either
   kind of host.  */
#define NEON_TYPE1(name, type) \
typedef struct { \
    type v1; \
} neon_##name;

#ifdef HOST_WORDS_BIGENDIAN

#define NEON_TYPE2(name, type) \
typedef struct { \
    type v2; \
    type v1; \
} neon_##name;

#define NEON_TYPE4(name, type) \
typedef struct { \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;

#else

#define NEON_TYPE2(name, type) \
typedef struct { \
    type v1; \
    type v2; \
} neon_##name;

#define NEON_TYPE4(name, type) \
typedef struct { \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;

#endif
83

    
84
/* Instantiate the lane structures used by the helpers below:
   four bytes, two halfwords or one word per 32-bit container.  */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)

NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)

NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)

/* The generators are not needed past this point.  */
#undef NEON_TYPE1
#undef NEON_TYPE2
#undef NEON_TYPE4
93

    
94
/* Copy from a uint32_t to a vector structure type.
   `val' is evaluated once; `dest' must be an lvalue of type `vtype'.  The
   bits are moved through a union, so no value conversion takes place.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t i; \
        vtype v; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)
103

    
104
/* Copy from a vector structure type to a uint32_t.
   `val' is evaluated once; `dest' must be a uint32_t lvalue.  The bits are
   moved through a union, so no value conversion takes place.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t i; \
        vtype v; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
113

    
114
/* Apply the currently defined NEON_FN lane by lane to vsrc1/vsrc2, storing
   into vdest.  Each wider variant extends the narrower one, so the lanes
   are always processed in the order v1, v2, v3, v4.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
124

    
125
/* Shared body of a two-operand element-wise helper: unpack arg1 and arg2
   into lane structures, run NEON_FN on each of the n lanes, and return the
   repacked result.  */
#define NEON_VOP_BODY(vtype, n) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name> taking the two packed operands.  */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives `env', which makes SET_QC()
   usable inside NEON_FN.  */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
145

    
146
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Adjacent lanes of vsrc1 are combined into the first half of vdest and
   adjacent lanes of vsrc2 into the second half.  The last line of each
   definition deliberately has no trailing backslash, so the macro cannot
   swallow whatever line follows it.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
157

    
158
/* Define pairwise helper neon_<name>: unpack both operands, combine
   neighbouring lanes with NEON_FN (see NEON_PDOn) and repack.  */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
171

    
172
/* Unary operators.  */
/* Define one-operand helper neon_<name>.  NEON_DOn still passes a third
   (vsrc2) argument to NEON_FN, so a unary NEON_FN must accept and ignore
   it; no vsrc2 variable exists here.  The result is returned through the
   reused `arg' parameter.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vdest; \
    vtype vsrc1; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
183

    
184

    
185
#define NEON_USAT(dest, src1, src2, type) do { \
186
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
187
    if (tmp != (type)tmp) { \
188
        SET_QC(); \
189
        dest = ~0; \
190
    } else { \
191
        dest = tmp; \
192
    }} while(0)
193
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
194
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
195
#undef NEON_FN
196
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
197
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
198
#undef NEON_FN
199
#undef NEON_USAT
200

    
201
#define NEON_SSAT(dest, src1, src2, type) do { \
202
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
203
    if (tmp != (type)tmp) { \
204
        SET_QC(); \
205
        if (src2 > 0) { \
206
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
207
        } else { \
208
            tmp = 1 << (sizeof(type) * 8 - 1); \
209
        } \
210
    } \
211
    dest = tmp; \
212
    } while(0)
213
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
214
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
215
#undef NEON_FN
216
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
217
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
218
#undef NEON_FN
219
#undef NEON_SSAT
220

    
221
#define NEON_USAT(dest, src1, src2, type) do { \
222
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
223
    if (tmp != (type)tmp) { \
224
        SET_QC(); \
225
        dest = 0; \
226
    } else { \
227
        dest = tmp; \
228
    }} while(0)
229
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
230
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
231
#undef NEON_FN
232
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
233
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
234
#undef NEON_FN
235
#undef NEON_USAT
236

    
237
#define NEON_SSAT(dest, src1, src2, type) do { \
238
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
239
    if (tmp != (type)tmp) { \
240
        SET_QC(); \
241
        if (src2 < 0) { \
242
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
243
        } else { \
244
            tmp = 1 << (sizeof(type) * 8 - 1); \
245
        } \
246
    } \
247
    dest = tmp; \
248
    } while(0)
249
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
250
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
251
#undef NEON_FN
252
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
253
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
254
#undef NEON_FN
255
#undef NEON_SSAT
256

    
257
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
258
NEON_VOP(hadd_s8, neon_s8, 4)
259
NEON_VOP(hadd_u8, neon_u8, 4)
260
NEON_VOP(hadd_s16, neon_s16, 2)
261
NEON_VOP(hadd_u16, neon_u16, 2)
262
#undef NEON_FN
263

    
264
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
265
{
266
    int32_t dest;
267

    
268
    dest = (src1 >> 1) + (src2 >> 1);
269
    if (src1 & src2 & 1)
270
        dest++;
271
    return dest;
272
}
273

    
274
uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
275
{
276
    uint32_t dest;
277

    
278
    dest = (src1 >> 1) + (src2 >> 1);
279
    if (src1 & src2 & 1)
280
        dest++;
281
    return dest;
282
}
283

    
284
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
285
NEON_VOP(rhadd_s8, neon_s8, 4)
286
NEON_VOP(rhadd_u8, neon_u8, 4)
287
NEON_VOP(rhadd_s16, neon_s16, 2)
288
NEON_VOP(rhadd_u16, neon_u16, 2)
289
#undef NEON_FN
290

    
291
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
292
{
293
    int32_t dest;
294

    
295
    dest = (src1 >> 1) + (src2 >> 1);
296
    if ((src1 | src2) & 1)
297
        dest++;
298
    return dest;
299
}
300

    
301
uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
302
{
303
    uint32_t dest;
304

    
305
    dest = (src1 >> 1) + (src2 >> 1);
306
    if ((src1 | src2) & 1)
307
        dest++;
308
    return dest;
309
}
310

    
311
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
312
NEON_VOP(hsub_s8, neon_s8, 4)
313
NEON_VOP(hsub_u8, neon_u8, 4)
314
NEON_VOP(hsub_s16, neon_s16, 2)
315
NEON_VOP(hsub_u16, neon_u16, 2)
316
#undef NEON_FN
317

    
318
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
319
{
320
    int32_t dest;
321

    
322
    dest = (src1 >> 1) - (src2 >> 1);
323
    if ((~src1) & src2 & 1)
324
        dest--;
325
    return dest;
326
}
327

    
328
uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
329
{
330
    uint32_t dest;
331

    
332
    dest = (src1 >> 1) - (src2 >> 1);
333
    if ((~src1) & src2 & 1)
334
        dest--;
335
    return dest;
336
}
337

    
338
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
339
NEON_VOP(cgt_s8, neon_s8, 4)
340
NEON_VOP(cgt_u8, neon_u8, 4)
341
NEON_VOP(cgt_s16, neon_s16, 2)
342
NEON_VOP(cgt_u16, neon_u16, 2)
343
NEON_VOP(cgt_s32, neon_s32, 1)
344
NEON_VOP(cgt_u32, neon_u32, 1)
345
#undef NEON_FN
346

    
347
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
348
NEON_VOP(cge_s8, neon_s8, 4)
349
NEON_VOP(cge_u8, neon_u8, 4)
350
NEON_VOP(cge_s16, neon_s16, 2)
351
NEON_VOP(cge_u16, neon_u16, 2)
352
NEON_VOP(cge_s32, neon_s32, 1)
353
NEON_VOP(cge_u32, neon_u32, 1)
354
#undef NEON_FN
355

    
356
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
357
NEON_VOP(min_s8, neon_s8, 4)
358
NEON_VOP(min_u8, neon_u8, 4)
359
NEON_VOP(min_s16, neon_s16, 2)
360
NEON_VOP(min_u16, neon_u16, 2)
361
NEON_VOP(min_s32, neon_s32, 1)
362
NEON_VOP(min_u32, neon_u32, 1)
363
NEON_POP(pmin_s8, neon_s8, 4)
364
NEON_POP(pmin_u8, neon_u8, 4)
365
NEON_POP(pmin_s16, neon_s16, 2)
366
NEON_POP(pmin_u16, neon_u16, 2)
367
#undef NEON_FN
368

    
369
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
370
NEON_VOP(max_s8, neon_s8, 4)
371
NEON_VOP(max_u8, neon_u8, 4)
372
NEON_VOP(max_s16, neon_s16, 2)
373
NEON_VOP(max_u16, neon_u16, 2)
374
NEON_VOP(max_s32, neon_s32, 1)
375
NEON_VOP(max_u32, neon_u32, 1)
376
NEON_POP(pmax_s8, neon_s8, 4)
377
NEON_POP(pmax_u8, neon_u8, 4)
378
NEON_POP(pmax_s16, neon_s16, 2)
379
NEON_POP(pmax_u16, neon_u16, 2)
380
#undef NEON_FN
381

    
382
#define NEON_FN(dest, src1, src2) \
383
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
384
NEON_VOP(abd_s8, neon_s8, 4)
385
NEON_VOP(abd_u8, neon_u8, 4)
386
NEON_VOP(abd_s16, neon_s16, 2)
387
NEON_VOP(abd_u16, neon_u16, 2)
388
NEON_VOP(abd_s32, neon_s32, 1)
389
NEON_VOP(abd_u32, neon_u32, 1)
390
#undef NEON_FN
391

    
392
#define NEON_FN(dest, src1, src2) do { \
393
    int8_t tmp; \
394
    tmp = (int8_t)src2; \
395
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
396
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
397
        dest = 0; \
398
    } else if (tmp < 0) { \
399
        dest = src1 >> -tmp; \
400
    } else { \
401
        dest = src1 << tmp; \
402
    }} while (0)
403
NEON_VOP(shl_u8, neon_u8, 4)
404
NEON_VOP(shl_u16, neon_u16, 2)
405
NEON_VOP(shl_u32, neon_u32, 1)
406
#undef NEON_FN
407

    
408
uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
409
{
410
    int8_t shift = (int8_t)shiftop;
411
    if (shift >= 64 || shift <= -64) {
412
        val = 0;
413
    } else if (shift < 0) {
414
        val >>= -shift;
415
    } else {
416
        val <<= shift;
417
    }
418
    return val;
419
}
420

    
421
#define NEON_FN(dest, src1, src2) do { \
422
    int8_t tmp; \
423
    tmp = (int8_t)src2; \
424
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
425
        dest = 0; \
426
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
427
        dest = src1 >> (sizeof(src1) * 8 - 1); \
428
    } else if (tmp < 0) { \
429
        dest = src1 >> -tmp; \
430
    } else { \
431
        dest = src1 << tmp; \
432
    }} while (0)
433
NEON_VOP(shl_s8, neon_s8, 4)
434
NEON_VOP(shl_s16, neon_s16, 2)
435
NEON_VOP(shl_s32, neon_s32, 1)
436
#undef NEON_FN
437

    
438
uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
439
{
440
    int8_t shift = (int8_t)shiftop;
441
    int64_t val = valop;
442
    if (shift >= 64) {
443
        val = 0;
444
    } else if (shift <= -64) {
445
        val >>= 63;
446
    } else if (shift < 0) {
447
        val >>= -shift;
448
    } else {
449
        val <<= shift;
450
    }
451
    return val;
452
}
453

    
454
#define NEON_FN(dest, src1, src2) do { \
455
    int8_t tmp; \
456
    tmp = (int8_t)src2; \
457
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
458
        dest = 0; \
459
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
460
        dest = src1 >> (sizeof(src1) * 8 - 1); \
461
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
462
        dest = src1 >> (tmp - 1); \
463
        dest++; \
464
        dest >>= 1; \
465
    } else if (tmp < 0) { \
466
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
467
    } else { \
468
        dest = src1 << tmp; \
469
    }} while (0)
470
NEON_VOP(rshl_s8, neon_s8, 4)
471
NEON_VOP(rshl_s16, neon_s16, 2)
472
NEON_VOP(rshl_s32, neon_s32, 1)
473
#undef NEON_FN
474

    
475
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
476
{
477
    int8_t shift = (int8_t)shiftop;
478
    int64_t val = valop;
479
    if (shift >= 64) {
480
        val = 0;
481
    } else if (shift < -64) {
482
        val >>= 63;
483
    } else if (shift == -63) {
484
        val >>= 63;
485
        val++;
486
        val >>= 1;
487
    } else if (shift < 0) {
488
        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
489
    } else {
490
        val <<= shift;
491
    }
492
    return val;
493
}
494

    
495
#define NEON_FN(dest, src1, src2) do { \
496
    int8_t tmp; \
497
    tmp = (int8_t)src2; \
498
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
499
        tmp < -(ssize_t)sizeof(src1) * 8) { \
500
        dest = 0; \
501
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
502
        dest = src1 >> (tmp - 1); \
503
    } else if (tmp < 0) { \
504
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
505
    } else { \
506
        dest = src1 << tmp; \
507
    }} while (0)
508
NEON_VOP(rshl_u8, neon_u8, 4)
509
NEON_VOP(rshl_u16, neon_u16, 2)
510
NEON_VOP(rshl_u32, neon_u32, 1)
511
#undef NEON_FN
512

    
513
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
514
{
515
    int8_t shift = (uint8_t)shiftop;
516
    if (shift >= 64 || shift < 64) {
517
        val = 0;
518
    } else if (shift == -64) {
519
        /* Rounding a 1-bit result just preserves that bit.  */
520
        val >>= 63;
521
    } if (shift < 0) {
522
        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
523
        val >>= -shift;
524
    } else {
525
        val <<= shift;
526
    }
527
    return val;
528
}
529

    
530
#define NEON_FN(dest, src1, src2) do { \
531
    int8_t tmp; \
532
    tmp = (int8_t)src2; \
533
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
534
        if (src1) { \
535
            SET_QC(); \
536
            dest = ~0; \
537
        } else { \
538
            dest = 0; \
539
        } \
540
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
541
        dest = 0; \
542
    } else if (tmp < 0) { \
543
        dest = src1 >> -tmp; \
544
    } else { \
545
        dest = src1 << tmp; \
546
        if ((dest >> tmp) != src1) { \
547
            SET_QC(); \
548
            dest = ~0; \
549
        } \
550
    }} while (0)
551
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
552
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
553
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
554
#undef NEON_FN
555

    
556
/* Unsigned 64-bit saturating shift; the count is the low byte of shiftop,
   read as signed.  Losing set bits on a left shift clamps the result to
   all ones and raises the saturation flag.  */
uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    uint64_t shifted;

    if (shift <= -64) {
        return 0;
    }
    if (shift < 0) {
        return val >> -shift;
    }
    if (shift >= 64) {
        /* Every bit is shifted out: only zero survives unsaturated.  */
        if (val) {
            val = ~(uint64_t)0;
            SET_QC();
        }
        return val;
    }
    shifted = val << shift;
    if ((shifted >> shift) != val) {
        SET_QC();
        shifted = ~(uint64_t)0;
    }
    return shifted;
}
578

    
579
#define NEON_FN(dest, src1, src2) do { \
580
    int8_t tmp; \
581
    tmp = (int8_t)src2; \
582
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
583
        if (src1) { \
584
            SET_QC(); \
585
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
586
            if (src1 > 0) { \
587
                dest--; \
588
            } \
589
        } else { \
590
            dest = src1; \
591
        } \
592
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
593
        dest = src1 >> 31; \
594
    } else if (tmp < 0) { \
595
        dest = src1 >> -tmp; \
596
    } else { \
597
        dest = src1 << tmp; \
598
        if ((dest >> tmp) != src1) { \
599
            SET_QC(); \
600
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
601
            if (src1 > 0) { \
602
                dest--; \
603
            } \
604
        } \
605
    }} while (0)
606
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
607
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
608
NEON_VOP_ENV(qshl_s32, neon_s32, 1)
609
#undef NEON_FN
610

    
611
/* Signed 64-bit saturating shift; the count is the low byte of shiftop,
   read as signed.  Overflow on a left shift saturates to the extreme
   value with the sign of the input and raises the saturation flag.  */
uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;
    int64_t before;

    if (shift <= -64) {
        /* Only the sign is left.  */
        return val >> 63;
    }
    if (shift < 0) {
        return val >> -shift;
    }
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
        return val;
    }
    before = val;
    val <<= shift;
    if ((val >> shift) != before) {
        SET_QC();
        val = (before >> 63) ^ ~SIGNBIT64;
    }
    return val;
}
634

    
635

    
636
/* FIXME: This is wrong.  */
637
#define NEON_FN(dest, src1, src2) do { \
638
    int8_t tmp; \
639
    tmp = (int8_t)src2; \
640
    if (tmp < 0) { \
641
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
642
    } else { \
643
        dest = src1 << tmp; \
644
        if ((dest >> tmp) != src1) { \
645
            SET_QC(); \
646
            dest = ~0; \
647
        } \
648
    }} while (0)
649
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
650
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
651
NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
652
#undef NEON_FN
653

    
654
/* Unsigned 64-bit saturating rounding shift; the count is the low byte of
   shiftop, read as signed.

   Left shifts that lose set bits (including any shift of 64 or more on a
   non-zero value) saturate to all ones and raise the saturation flag.
   Right shifts round: by more than 64 the result is zero, by exactly 64
   only the rounding carry (bit 63) remains, otherwise the result is
   (val >> n) plus bit n-1 of val, which avoids both an int-width rounding
   constant and the wrap-around of val + 2^(n-1).  */
uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;

    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    } else if (shift < -64) {
        val = 0;
    } else if (shift == -64) {
        val >>= 63;
    } else if (shift < 0) {
        int n = -shift;

        val = (val >> n) + ((val >> (n - 1)) & 1);
    } else {
        uint64_t tmp = val;

        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    }
    return val;
}
669

    
670
#define NEON_FN(dest, src1, src2) do { \
671
    int8_t tmp; \
672
    tmp = (int8_t)src2; \
673
    if (tmp < 0) { \
674
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
675
    } else { \
676
        dest = src1 << tmp; \
677
        if ((dest >> tmp) != src1) { \
678
            SET_QC(); \
679
            dest = src1 >> 31; \
680
        } \
681
    }} while (0)
682
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
683
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
684
NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
685
#undef NEON_FN
686

    
687
/* Signed 64-bit saturating rounding shift; the count is the low byte of
   shiftop, read as signed.

   Left shifts whose result does not shift back to the input (including any
   shift of 64 or more on a non-zero value) saturate to INT64_MAX or
   INT64_MIN according to the sign of the input and raise the saturation
   flag.  Right shifts round: by 64 or more the result is zero, otherwise
   it is floor(val / 2^n) plus bit n-1 of val, the rounded quotient
   computed without a 64-bit overflow.  */
uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;

    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        int n = -shift;

        val = (val >> n) + ((val >> (n - 1)) & 1);
    } else {
        int64_t tmp = val;

        /* Shift the bit pattern; left-shifting a negative value is
           undefined in C.  */
        val = (int64_t)((uint64_t)val << shift);
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}
704

    
705
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
706
{
707
    uint32_t mask;
708
    mask = (a ^ b) & 0x80808080u;
709
    a &= ~0x80808080u;
710
    b &= ~0x80808080u;
711
    return (a + b) ^ mask;
712
}
713

    
714
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
715
{
716
    uint32_t mask;
717
    mask = (a ^ b) & 0x80008000u;
718
    a &= ~0x80008000u;
719
    b &= ~0x80008000u;
720
    return (a + b) ^ mask;
721
}
722

    
723
#define NEON_FN(dest, src1, src2) dest = src1 + src2
724
NEON_POP(padd_u8, neon_u8, 4)
725
NEON_POP(padd_u16, neon_u16, 2)
726
#undef NEON_FN
727

    
728
#define NEON_FN(dest, src1, src2) dest = src1 - src2
729
NEON_VOP(sub_u8, neon_u8, 4)
730
NEON_VOP(sub_u16, neon_u16, 2)
731
#undef NEON_FN
732

    
733
#define NEON_FN(dest, src1, src2) dest = src1 * src2
734
NEON_VOP(mul_u8, neon_u8, 4)
735
NEON_VOP(mul_u16, neon_u16, 2)
736
#undef NEON_FN
737

    
738
/* Polynomial multiplication is like integer multiplication except the
739
   partial products are XORed, not added.  */
740
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
741
{
742
    uint32_t mask;
743
    uint32_t result;
744
    result = 0;
745
    while (op1) {
746
        mask = 0;
747
        if (op1 & 1)
748
            mask |= 0xff;
749
        if (op1 & (1 << 8))
750
            mask |= (0xff << 8);
751
        if (op1 & (1 << 16))
752
            mask |= (0xff << 16);
753
        if (op1 & (1 << 24))
754
            mask |= (0xff << 24);
755
        result ^= op2 & mask;
756
        op1 = (op1 >> 1) & 0x7f7f7f7f;
757
        op2 = (op2 << 1) & 0xfefefefe;
758
    }
759
    return result;
760
}
761

    
762
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
763
NEON_VOP(tst_u8, neon_u8, 4)
764
NEON_VOP(tst_u16, neon_u16, 2)
765
NEON_VOP(tst_u32, neon_u32, 1)
766
#undef NEON_FN
767

    
768
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
769
NEON_VOP(ceq_u8, neon_u8, 4)
770
NEON_VOP(ceq_u16, neon_u16, 2)
771
NEON_VOP(ceq_u32, neon_u32, 1)
772
#undef NEON_FN
773

    
774
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
775
NEON_VOP1(abs_s8, neon_s8, 4)
776
NEON_VOP1(abs_s16, neon_s16, 2)
777
#undef NEON_FN
778

    
779
/* Count Leading Sign/Zero Bits.  */
780
/* Number of leading zero bits in an 8-bit value (8 for zero).  */
static inline int do_clz8(uint8_t x)
{
    int n = 8;

    /* Each significant bit shifted out removes one leading zero.  */
    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
787

    
788
/* Number of leading zero bits in a 16-bit value (16 for zero).  */
static inline int do_clz16(uint16_t x)
{
    int n = 16;

    /* Each significant bit shifted out removes one leading zero.  */
    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
795

    
796
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
797
NEON_VOP1(clz_u8, neon_u8, 4)
798
#undef NEON_FN
799

    
800
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
801
NEON_VOP1(clz_u16, neon_u16, 2)
802
#undef NEON_FN
803

    
804
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
805
NEON_VOP1(cls_s8, neon_s8, 4)
806
#undef NEON_FN
807

    
808
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
809
NEON_VOP1(cls_s16, neon_s16, 2)
810
#undef NEON_FN
811

    
812
uint32_t HELPER(neon_cls_s32)(uint32_t x)
813
{
814
    int count;
815
    if ((int32_t)x < 0)
816
        x = ~x;
817
    for (count = 32; x; count--)
818
        x = x >> 1;
819
    return count - 1;
820
}
821

    
822
/* Bit count.  */
823
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
824
{
825
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
826
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
827
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
828
    return x;
829
}
830

    
831
#define NEON_QDMULH16(dest, src1, src2, round) do { \
832
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
833
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
834
        SET_QC(); \
835
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
836
    } \
837
    tmp <<= 1; \
838
    if (round) { \
839
        int32_t old = tmp; \
840
        tmp += 1 << 15; \
841
        if ((int32_t)tmp < old) { \
842
            SET_QC(); \
843
            tmp = SIGNBIT - 1; \
844
        } \
845
    } \
846
    dest = tmp >> 16; \
847
    } while(0)
848
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
849
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
850
#undef NEON_FN
851
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
852
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
853
#undef NEON_FN
854
#undef NEON_QDMULH16
855

    
856
#define NEON_QDMULH32(dest, src1, src2, round) do { \
857
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
858
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
859
        SET_QC(); \
860
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
861
    } else { \
862
        tmp <<= 1; \
863
    } \
864
    if (round) { \
865
        int64_t old = tmp; \
866
        tmp += (int64_t)1 << 31; \
867
        if ((int64_t)tmp < old) { \
868
            SET_QC(); \
869
            tmp = SIGNBIT64 - 1; \
870
        } \
871
    } \
872
    dest = tmp >> 32; \
873
    } while(0)
874
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
875
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
876
#undef NEON_FN
877
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
878
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
879
#undef NEON_FN
880
#undef NEON_QDMULH32
881

    
882
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
883
{
884
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
885
           | ((x >> 24) & 0xff000000u);
886
}
887

    
888
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
889
{
890
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
891
}
892

    
893
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
894
{
895
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
896
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
897
}
898

    
899
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
900
{
901
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
902
}
903

    
904
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
905
{
906
    x &= 0xff80ff80ff80ff80ull;
907
    x += 0x0080008000800080ull;
908
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
909
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
910
}
911

    
912
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
913
{
914
    x &= 0xffff8000ffff8000ull;
915
    x += 0x0000800000008000ull;
916
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
917
}
918

    
919
/* Narrow four unsigned 16-bit lanes to bytes.  Lanes above 0xff are
   clamped to 0xff and raise the saturation flag.  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
{
    uint32_t res = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint16_t s = x >> (16 * lane);
        uint8_t d;

        if (s > 0xff) {
            d = 0xff;
            SET_QC();
        } else {
            d = s;
        }
        res |= (uint32_t)d << (8 * lane);
    }
    return res;
}
941

    
942
/* Narrow four signed 16-bit lanes to bytes.  Lanes that do not fit in an
   int8_t are clamped to 0x7f or 0x80 according to their sign and raise the
   saturation flag.  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
{
    uint32_t res = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        int16_t s = x >> (16 * lane);
        uint8_t d;

        if (s != (int8_t)s) {
            /* s >> 15 is 0 or -1, which selects 0x7f or 0x80.  */
            d = (s >> 15) ^ 0x7f;
            SET_QC();
        } else {
            d = s;
        }
        res |= (uint32_t)d << (8 * lane);
    }
    return res;
}
964

    
965
/* Narrow two unsigned 32-bit lanes to halfwords.  Lanes above 0xffff are
   clamped to 0xffff and raise the saturation flag.  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
{
    uint32_t low = x;
    uint32_t high = x >> 32;

    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}
981

    
982
/* Narrow two signed 32-bit lanes to halfwords.  Lanes that do not fit in
   an int16_t are clamped to 0x7fff or 0x8000 according to their sign and
   raise the saturation flag.  */
uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
{
    int32_t low = x;
    int32_t high = x >> 32;

    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}
998

    
999
/* Narrow an unsigned 64-bit value to 32 bits.  Values above 0xffffffff are
   clamped to 0xffffffff and raise the saturation flag.  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
{
    if (x <= 0xffffffffu) {
        return x;
    }
    SET_QC();
    return 0xffffffffu;
}
1007

    
1008
/* Narrow a signed 64-bit value to 32 bits.  Values that do not fit in an
   int32_t raise the saturation flag and are clamped to 0x7fffffff
   (positive) or 0x80000000 (negative).

   The sign must be extracted with an arithmetic shift: (int64_t)x >> 63 is
   0 or -1, and XOR with 0x7fffffff then gives the two clamp values.  A
   logical shift of the unsigned x would give 1 for negative inputs and so
   0x7ffffffe.  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
1016

    
1017
uint64_t HELPER(neon_widen_u8)(uint32_t x)
1018
{
1019
    uint64_t tmp;
1020
    uint64_t ret;
1021
    ret = (uint8_t)x;
1022
    tmp = (uint8_t)(x >> 8);
1023
    ret |= tmp << 16;
1024
    tmp = (uint8_t)(x >> 16);
1025
    ret |= tmp << 32;
1026
    tmp = (uint8_t)(x >> 24);
1027
    ret |= tmp << 48;
1028
    return ret;
1029
}
1030

    
1031
uint64_t HELPER(neon_widen_s8)(uint32_t x)
1032
{
1033
    uint64_t tmp;
1034
    uint64_t ret;
1035
    ret = (uint16_t)(int8_t)x;
1036
    tmp = (uint16_t)(int8_t)(x >> 8);
1037
    ret |= tmp << 16;
1038
    tmp = (uint16_t)(int8_t)(x >> 16);
1039
    ret |= tmp << 32;
1040
    tmp = (uint16_t)(int8_t)(x >> 24);
1041
    ret |= tmp << 48;
1042
    return ret;
1043
}
1044

    
1045
uint64_t HELPER(neon_widen_u16)(uint32_t x)
1046
{
1047
    uint64_t high = (uint16_t)(x >> 16);
1048
    return ((uint16_t)x) | (high << 32);
1049
}
1050

    
1051
uint64_t HELPER(neon_widen_s16)(uint32_t x)
1052
{
1053
    uint64_t high = (int16_t)(x >> 16);
1054
    return ((uint32_t)(int16_t)x) | (high << 32);
1055
}
1056

    
1057
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
1058
{
1059
    uint64_t mask;
1060
    mask = (a ^ b) & 0x8000800080008000ull;
1061
    a &= ~0x8000800080008000ull;
1062
    b &= ~0x8000800080008000ull;
1063
    return (a + b) ^ mask;
1064
}
1065

    
1066
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
1067
{
1068
    uint64_t mask;
1069
    mask = (a ^ b) & 0x8000000080000000ull;
1070
    a &= ~0x8000000080000000ull;
1071
    b &= ~0x8000000080000000ull;
1072
    return (a + b) ^ mask;
1073
}
1074

    
1075
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
1076
{
1077
    uint64_t tmp;
1078
    uint64_t tmp2;
1079

    
1080
    tmp = a & 0x0000ffff0000ffffull;
1081
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
1082
    tmp2 = b & 0xffff0000ffff0000ull;
1083
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
1084
    return    ( tmp         & 0xffff)
1085
            | ((tmp  >> 16) & 0xffff0000ull)
1086
            | ((tmp2 << 16) & 0xffff00000000ull)
1087
            | ( tmp2        & 0xffff000000000000ull);
1088
}
1089

    
1090
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1091
{
1092
    uint32_t low = a + (a >> 32);
1093
    uint32_t high = b + (b >> 32);
1094
    return low + ((uint64_t)high << 32);
1095
}
1096

    
1097
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
1098
{
1099
    uint64_t mask;
1100
    mask = (a ^ ~b) & 0x8000800080008000ull;
1101
    a |= 0x8000800080008000ull;
1102
    b &= ~0x8000800080008000ull;
1103
    return (a - b) ^ mask;
1104
}
1105

    
1106
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
1107
{
1108
    uint64_t mask;
1109
    mask = (a ^ ~b) & 0x8000000080000000ull;
1110
    a |= 0x8000000080000000ull;
1111
    b &= ~0x8000000080000000ull;
1112
    return (a - b) ^ mask;
1113
}
1114

    
1115
/* Signed saturating add of two packed 32-bit lanes.  A lane overflows when
   both operands have the same sign and the sum has the other one; it is
   then clamped towards the sign of the operands and the saturation flag is
   raised.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
{
    uint32_t x = a;
    uint32_t y = b;
    uint32_t low = x + y;
    uint32_t high;

    if (!((x ^ y) & SIGNBIT) && ((low ^ x) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }

    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (!((x ^ y) & SIGNBIT) && ((high ^ x) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }

    return ((uint64_t)high << 32) | low;
}
1136

    
1137
/* Signed saturating 64-bit add.  Overflow occurs when both operands have
   the same sign and the sum has the other one; the result is then clamped
   towards the sign of the operands and the saturation flag is raised.  */
uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t sum = a + b;

    if (!((a ^ b) & SIGNBIT64) && ((sum ^ a) & SIGNBIT64)) {
        SET_QC();
        return ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return sum;
}
1148

    
1149
/* dest = |x - y| after narrowing x and y to TYPE.
   The subtraction is carried out in int64_t, which holds every 8/16/32-bit
   lane value and any difference of two of them.  Subtracting directly in
   TYPE overflowed for int32_t (e.g. INT32_MAX - INT32_MIN) and the wrapped
   negative result was then sign-extended into the 64-bit destination.  */
#define DO_ABD(dest, x, y, type) do { \
    type tmp_x = x; \
    type tmp_y = y; \
    dest = ((tmp_x > tmp_y) ? (int64_t)tmp_x - (int64_t)tmp_y \
                            : (int64_t)tmp_y - (int64_t)tmp_x); \
    } while(0)

/* Unsigned absolute difference of the four 8-bit lanes of a and b,
   returned as four 16-bit lanes.  */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
    result |= tmp << 48;
    return result;
}

/* Signed absolute difference of the four 8-bit lanes of a and b,
   returned as four 16-bit lanes.  */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t);
    result |= tmp << 48;
    return result;
}

/* Unsigned absolute difference of the two 16-bit lanes of a and b,
   returned as two 32-bit lanes.  */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
    return result | (tmp << 32);
}

/* Signed absolute difference of the two 16-bit lanes of a and b,
   returned as two 32-bit lanes.  */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t);
    return result | (tmp << 32);
}

/* Unsigned absolute difference of two 32-bit values, widened to 64 bits.  */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t);
    return result;
}

/* Signed absolute difference of two 32-bit values, widened to 64 bits.
   The result is at most 0xffffffff.  */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t);
    return result;
}
#undef DO_ABD
1215

    
1216
/* Widening multiply.  Named type is the source type.
   x and y are narrowed to type1, multiplied, and the product truncated to
   type2.  The multiplication is done in int64_t: multiplying directly in
   type2 is unsafe when type2 is uint16_t, because both operands promote to
   int and a sign-extended pair such as 0xffff * 0xffff overflows it.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((int64_t)tmp_x * (int64_t)tmp_y); \
    } while(0)

/* Unsigned multiply of the four 8-bit lanes of a and b into four 16-bit
   lanes.  */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Signed multiply of the four 8-bit lanes of a and b into four 16-bit
   lanes (each product stored as its 16-bit two's-complement pattern).  */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Unsigned multiply of the two 16-bit lanes of a and b into two 32-bit
   lanes.  */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Signed multiply of the two 16-bit lanes of a and b into two 32-bit
   lanes (each product stored as its 32-bit two's-complement pattern).  */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1272

    
1273
/* Negate each of the four 16-bit lanes of x independently (modulo 2^16).  */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint64_t out = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 16) {
        /* Truncating to 16 bits discards whatever the higher lanes
           contributed to the 64-bit negation.  */
        uint16_t lane = -(x >> shift);
        out |= (uint64_t)lane << shift;
    }
    return out;
}
1286

    
1287
#include <stdio.h>
1288
/* Negate each of the two 32-bit lanes of x independently (modulo 2^32).  */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t neg_lo = 0 - (uint32_t)x;
    uint32_t neg_hi = 0 - (uint32_t)(x >> 32);

    return ((uint64_t)neg_hi << 32) | neg_lo;
}
1294

    
1295
/* Two's-complement negation of a 64-bit value.
   FIXME:  There should be a native op for this.  */
uint64_t HELPER(neon_negl_u64)(uint64_t x)
{
    uint64_t negated = 0 - x;

    return negated;
}
1300

    
1301
/* Saturating sign manipulation.  */
1302
/* ??? Make these use NEON_VOP1 */
1303
#define DO_QABS8(x) do { \
1304
    if (x == (int8_t)0x80) { \
1305
        x = 0x7f; \
1306
        SET_QC(); \
1307
    } else if (x < 0) { \
1308
        x = -x; \
1309
    }} while (0)
1310
uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
1311
{
1312
    neon_s8 vec;
1313
    NEON_UNPACK(neon_s8, vec, x);
1314
    DO_QABS8(vec.v1);
1315
    DO_QABS8(vec.v2);
1316
    DO_QABS8(vec.v3);
1317
    DO_QABS8(vec.v4);
1318
    NEON_PACK(neon_s8, x, vec);
1319
    return x;
1320
}
1321
#undef DO_QABS8
1322

    
1323
#define DO_QNEG8(x) do { \
1324
    if (x == (int8_t)0x80) { \
1325
        x = 0x7f; \
1326
        SET_QC(); \
1327
    } else { \
1328
        x = -x; \
1329
    }} while (0)
1330
uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
1331
{
1332
    neon_s8 vec;
1333
    NEON_UNPACK(neon_s8, vec, x);
1334
    DO_QNEG8(vec.v1);
1335
    DO_QNEG8(vec.v2);
1336
    DO_QNEG8(vec.v3);
1337
    DO_QNEG8(vec.v4);
1338
    NEON_PACK(neon_s8, x, vec);
1339
    return x;
1340
}
1341
#undef DO_QNEG8
1342

    
1343
#define DO_QABS16(x) do { \
1344
    if (x == (int16_t)0x8000) { \
1345
        x = 0x7fff; \
1346
        SET_QC(); \
1347
    } else if (x < 0) { \
1348
        x = -x; \
1349
    }} while (0)
1350
uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
1351
{
1352
    neon_s16 vec;
1353
    NEON_UNPACK(neon_s16, vec, x);
1354
    DO_QABS16(vec.v1);
1355
    DO_QABS16(vec.v2);
1356
    NEON_PACK(neon_s16, x, vec);
1357
    return x;
1358
}
1359
#undef DO_QABS16
1360

    
1361
#define DO_QNEG16(x) do { \
1362
    if (x == (int16_t)0x8000) { \
1363
        x = 0x7fff; \
1364
        SET_QC(); \
1365
    } else { \
1366
        x = -x; \
1367
    }} while (0)
1368
uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
1369
{
1370
    neon_s16 vec;
1371
    NEON_UNPACK(neon_s16, vec, x);
1372
    DO_QNEG16(vec.v1);
1373
    DO_QNEG16(vec.v2);
1374
    NEON_PACK(neon_s16, x, vec);
1375
    return x;
1376
}
1377
#undef DO_QNEG16
1378

    
1379
/* Saturating absolute value of x interpreted as a signed 32-bit integer.
   The most negative value yields ~SIGNBIT (0x7fffffff) and invokes
   SET_QC().  */
uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
{
    if ((int32_t)x >= 0) {
        return x;
    }
    if (x != SIGNBIT) {
        return -x;
    }
    SET_QC();
    return ~SIGNBIT;
}
1389

    
1390
/* Saturating negation of x interpreted as a signed 32-bit integer.
   The most negative value yields ~SIGNBIT (0x7fffffff) and invokes
   SET_QC(); every other value is negated modulo 2^32.  */
uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
{
    if (x != SIGNBIT) {
        return -x;
    }
    SET_QC();
    return ~SIGNBIT;
}
1400

    
1401
/* NEON Float helpers.  */
1402
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
1403
{
1404
    float32 f0 = vfp_itos(a);
1405
    float32 f1 = vfp_itos(b);
1406
    return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
1407
}
1408

    
1409
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
1410
{
1411
    float32 f0 = vfp_itos(a);
1412
    float32 f1 = vfp_itos(b);
1413
    return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
1414
}
1415

    
1416
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
1417
{
1418
    float32 f0 = vfp_itos(a);
1419
    float32 f1 = vfp_itos(b);
1420
    return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
1421
                    ? float32_sub(f0, f1, NFS)
1422
                    : float32_sub(f1, f0, NFS));
1423
}
1424

    
1425
uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
1426
{
1427
    return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
1428
}
1429

    
1430
uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
1431
{
1432
    return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
1433
}
1434

    
1435
uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
1436
{
1437
    return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
1438
}
1439

    
1440
/* Floating point comparisons produce an integer result.  */
1441
#define NEON_VOP_FCMP(name, cmp) \
1442
uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
1443
{ \
1444
    if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
1445
        return ~0; \
1446
    else \
1447
        return 0; \
1448
}
1449

    
1450
NEON_VOP_FCMP(ceq_f32, ==)
1451
NEON_VOP_FCMP(cge_f32, >=)
1452
NEON_VOP_FCMP(cgt_f32, >)
1453

    
1454
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
1455
{
1456
    float32 f0 = float32_abs(vfp_itos(a));
1457
    float32 f1 = float32_abs(vfp_itos(b));
1458
    return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0;
1459
}
1460

    
1461
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
1462
{
1463
    float32 f0 = float32_abs(vfp_itos(a));
1464
    float32 f1 = float32_abs(vfp_itos(b));
1465
    return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
1466
}