root / targetarm / neon_helper.c @ 4bd4ee07
History  View  Annotate  Download (51.5 kB)
1 
/*


2 
* ARM NEON vector operations.

3 
*

4 
* Copyright (c) 2007, 2008 CodeSourcery.

5 
* Written by Paul Brook

6 
*

7 
* This code is licenced under the GNU GPL v2.

8 
*/

9 
#include <stdlib.h>
#include <stdio.h>

#include "cpu.h"
#include "exec-all.h"
#include "helpers.h"

/* Sign bit of a 32-bit / 64-bit value; used for overflow detection in the
   saturating helpers below. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Set the cumulative saturation (QC) flag in FPSCR.  Note |=: the bit is
   sticky and must never be cleared by a later non-saturating operation. */
#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

/* Float status block shared by the NEON float helpers. */
static float_status neon_float_status;
#define NFS &neon_float_status

23  
24 
/* Helper routines to perform bitwise copies between float and int.  */

/* Reinterpret the bits of a 32-bit integer as a float32 (no value
   conversion; type-puns through a union, which is well-defined in C). */
static inline float32 vfp_itos(uint32_t i)
{
    union {
        uint32_t i;
        float32 s;
    } v;

    v.i = i;
    return v.s;
}

/* Reinterpret the bits of a float32 as a 32-bit integer (inverse of
   vfp_itos; no value conversion). */
static inline uint32_t vfp_stoi(float32 s)
{
    union {
        uint32_t i;
        float32 s;
    } v;

    v.s = s;
    return v.i;
}
46  
47 
/* Vector element container types.  NEON_TYPEn declares a struct holding n
   lanes of the given element type, packed into 32 bits total.  Lane order
   depends on host endianness so that NEON_UNPACK/NEON_PACK below (which
   type-pun through a union with uint32_t) always see lane v1 in the least
   significant bits of the 32-bit word. */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Instantiate the lane containers actually used by the helpers. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

93  
94 
/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN to each lane of a 1/2/4-lane vector. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Body shared by all two-operand lanewise helpers: unpack both operands,
   apply NEON_FN (defined just before each instantiation) per lane, repack. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper_neon_<name>(arg1, arg2). */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state, so that
   NEON_FN may use SET_QC(). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
145  
146 
/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

/* Define a pairwise helper: result lanes combine adjacent pairs, low
   half from arg1 and high half from arg2. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  NEON_FN receives a dummy third argument. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
183  
184  
185 
/* Unsigned saturating addition for sub-word lanes: the sum is computed
   exactly in 32 bits; if it does not fit in `type', saturate to all-ones
   and set the sticky QC flag. */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* 32-bit unsigned saturating add: unsigned overflow occurred iff the
   wrapped result is smaller than an operand. */
uint32_t HELPER(neon_qadd_u32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

/* 64-bit unsigned saturating add (same overflow rule as 32-bit). */
uint64_t HELPER(neon_qadd_u64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}
222  
223 
#define NEON_SSAT(dest, src1, src2, type) do { \ 
224 
int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ 
225 
if (tmp != (type)tmp) { \

226 
SET_QC(); \ 
227 
if (src2 > 0) { \ 
228 
tmp = (1 << (sizeof(type) * 8  1))  1; \ 
229 
} else { \

230 
tmp = 1 << (sizeof(type) * 8  1); \ 
231 
} \ 
232 
} \ 
233 
dest = tmp; \ 
234 
} while(0) 
235 
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)

236 
NEON_VOP_ENV(qadd_s8, neon_s8, 4)

237 
#undef NEON_FN

238 
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)

239 
NEON_VOP_ENV(qadd_s16, neon_s16, 2)

240 
#undef NEON_FN

241 
#undef NEON_SSAT

242  
243 
uint32_t HELPER(neon_qadd_s32)(CPUState *env, uint32_t a, uint32_t b) 
244 
{ 
245 
uint32_t res = a + b; 
246 
if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {

247 
SET_QC(); 
248 
res = ~(((int32_t)a >> 31) ^ SIGNBIT);

249 
} 
250 
return res;

251 
} 
252  
253 
uint64_t HELPER(neon_qadd_s64)(CPUState *env, uint64_t src1, uint64_t src2) 
254 
{ 
255 
uint64_t res; 
256  
257 
res = src1 + src2; 
258 
if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {

259 
SET_QC(); 
260 
res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;

261 
} 
262 
return res;

263 
} 
264  
265 
#define NEON_USAT(dest, src1, src2, type) do { \ 
266 
uint32_t tmp = (uint32_t)src1  (uint32_t)src2; \ 
267 
if (tmp != (type)tmp) { \

268 
SET_QC(); \ 
269 
dest = 0; \

270 
} else { \

271 
dest = tmp; \ 
272 
}} while(0) 
273 
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)

274 
NEON_VOP_ENV(qsub_u8, neon_u8, 4)

275 
#undef NEON_FN

276 
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)

277 
NEON_VOP_ENV(qsub_u16, neon_u16, 2)

278 
#undef NEON_FN

279 
#undef NEON_USAT

280  
281 
uint32_t HELPER(neon_qsub_u32)(CPUState *env, uint32_t a, uint32_t b) 
282 
{ 
283 
uint32_t res = a  b; 
284 
if (res > a) {

285 
SET_QC(); 
286 
res = 0;

287 
} 
288 
return res;

289 
} 
290  
291 
uint64_t HELPER(neon_qsub_u64)(CPUState *env, uint64_t src1, uint64_t src2) 
292 
{ 
293 
uint64_t res; 
294  
295 
if (src1 < src2) {

296 
SET_QC(); 
297 
res = 0;

298 
} else {

299 
res = src1  src2; 
300 
} 
301 
return res;

302 
} 
303  
304 
#define NEON_SSAT(dest, src1, src2, type) do { \ 
305 
int32_t tmp = (uint32_t)src1  (uint32_t)src2; \ 
306 
if (tmp != (type)tmp) { \

307 
SET_QC(); \ 
308 
if (src2 < 0) { \ 
309 
tmp = (1 << (sizeof(type) * 8  1))  1; \ 
310 
} else { \

311 
tmp = 1 << (sizeof(type) * 8  1); \ 
312 
} \ 
313 
} \ 
314 
dest = tmp; \ 
315 
} while(0) 
316 
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)

317 
NEON_VOP_ENV(qsub_s8, neon_s8, 4)

318 
#undef NEON_FN

319 
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)

320 
NEON_VOP_ENV(qsub_s16, neon_s16, 2)

321 
#undef NEON_FN

322 
#undef NEON_SSAT

323  
324 
uint32_t HELPER(neon_qsub_s32)(CPUState *env, uint32_t a, uint32_t b) 
325 
{ 
326 
uint32_t res = a  b; 
327 
if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {

328 
SET_QC(); 
329 
res = ~(((int32_t)a >> 31) ^ SIGNBIT);

330 
} 
331 
return res;

332 
} 
333  
334 
uint64_t HELPER(neon_qsub_s64)(CPUState *env, uint64_t src1, uint64_t src2) 
335 
{ 
336 
uint64_t res; 
337  
338 
res = src1  src2; 
339 
if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {

340 
SET_QC(); 
341 
res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;

342 
} 
343 
return res;

344 
} 
345  
346 
/* Halving add (VHADD): (src1 + src2) >> 1 per lane, truncating.  The lane
   sum is exact because integer promotion widens sub-word lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

352  
353 
/* 32-bit signed halving add: (src1 + src2) >> 1 without intermediate
   overflow.  Halve each operand first, then add back the carry that is
   lost only when both low bits were set. */
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t carry = src1 & src2 & 1;
    return (src1 >> 1) + (src2 >> 1) + carry;
}
362  
363 
/* 32-bit unsigned halving add: (src1 + src2) >> 1 without intermediate
   overflow (same carry trick as the signed variant). */
uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t carry = src1 & src2 & 1;
    return (src1 >> 1) + (src2 >> 1) + carry;
}
372  
373 
/* Rounding halving add (VRHADD): (src1 + src2 + 1) >> 1 per lane. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

379  
380 
/* 32-bit signed rounding halving add: (src1 + src2 + 1) >> 1 without
   intermediate overflow.  The rounding increment applies whenever either
   operand has its low bit set, hence OR (plain halving add uses AND). */
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}
389  
390 
/* 32-bit unsigned rounding halving add: (src1 + src2 + 1) >> 1 without
   intermediate overflow; rounding increment when either low bit is set. */
uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}
399  
400 
#define NEON_FN(dest, src1, src2) dest = (src1  src2) >> 1 
401 
NEON_VOP(hsub_s8, neon_s8, 4)

402 
NEON_VOP(hsub_u8, neon_u8, 4)

403 
NEON_VOP(hsub_s16, neon_s16, 2)

404 
NEON_VOP(hsub_u16, neon_u16, 2)

405 
#undef NEON_FN

406  
407 
/* 32-bit signed halving subtract: (src1 - src2) >> 1 without intermediate
   overflow.  A borrow correction is needed exactly when src2's low bit is
   set and src1's is not. */
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}
416  
417 
/* 32-bit unsigned halving subtract: (src1 - src2) >> 1 without
   intermediate wraparound (same borrow correction as signed). */
uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}
426  
427 
/* Compare greater-than (VCGT): all-ones lane when src1 > src2. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
NEON_VOP(cgt_s8, neon_s8, 4)
NEON_VOP(cgt_u8, neon_u8, 4)
NEON_VOP(cgt_s16, neon_s16, 2)
NEON_VOP(cgt_u16, neon_u16, 2)
NEON_VOP(cgt_s32, neon_s32, 1)
NEON_VOP(cgt_u32, neon_u32, 1)
#undef NEON_FN

/* Compare greater-or-equal (VCGE): all-ones lane when src1 >= src2. */
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
NEON_VOP(cge_s8, neon_s8, 4)
NEON_VOP(cge_u8, neon_u8, 4)
NEON_VOP(cge_s16, neon_s16, 2)
NEON_VOP(cge_u16, neon_u16, 2)
NEON_VOP(cge_s32, neon_s32, 1)
NEON_VOP(cge_u32, neon_u32, 1)
#undef NEON_FN

/* Lanewise minimum (VMIN) and pairwise minimum (VPMIN). */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise maximum (VMAX) and pairwise maximum (VPMAX). */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_VOP(max_s8, neon_s8, 4)
NEON_VOP(max_u8, neon_u8, 4)
NEON_VOP(max_s16, neon_s16, 2)
NEON_VOP(max_u16, neon_u16, 2)
NEON_VOP(max_s32, neon_s32, 1)
NEON_VOP(max_u32, neon_u32, 1)
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

470  
471 
#define NEON_FN(dest, src1, src2) \

472 
dest = (src1 > src2) ? (src1  src2) : (src2  src1) 
473 
NEON_VOP(abd_s8, neon_s8, 4)

474 
NEON_VOP(abd_u8, neon_u8, 4)

475 
NEON_VOP(abd_s16, neon_s16, 2)

476 
NEON_VOP(abd_u16, neon_u16, 2)

477 
NEON_VOP(abd_s32, neon_s32, 1)

478 
NEON_VOP(abd_u32, neon_u32, 1)

479 
#undef NEON_FN

480  
481 
#define NEON_FN(dest, src1, src2) do { \ 
482 
int8_t tmp; \ 
483 
tmp = (int8_t)src2; \ 
484 
if (tmp >= (ssize_t)sizeof(src1) * 8  \ 
485 
tmp <= (ssize_t)sizeof(src1) * 8) { \ 
486 
dest = 0; \

487 
} else if (tmp < 0) { \ 
488 
dest = src1 >> tmp; \ 
489 
} else { \

490 
dest = src1 << tmp; \ 
491 
}} while (0) 
492 
NEON_VOP(shl_u8, neon_u8, 4)

493 
NEON_VOP(shl_u16, neon_u16, 2)

494 
NEON_VOP(shl_u32, neon_u32, 1)

495 
#undef NEON_FN

496  
497 
/* 64-bit unsigned variable shift: signed 8-bit count, negative = right
   shift; |count| >= 64 yields zero. */
uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64 || shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}
509  
510 
#define NEON_FN(dest, src1, src2) do { \ 
511 
int8_t tmp; \ 
512 
tmp = (int8_t)src2; \ 
513 
if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 
514 
dest = 0; \

515 
} else if (tmp <= (ssize_t)sizeof(src1) * 8) { \ 
516 
dest = src1 >> (sizeof(src1) * 8  1); \ 
517 
} else if (tmp < 0) { \ 
518 
dest = src1 >> tmp; \ 
519 
} else { \

520 
dest = src1 << tmp; \ 
521 
}} while (0) 
522 
NEON_VOP(shl_s8, neon_s8, 4)

523 
NEON_VOP(shl_s16, neon_s16, 2)

524 
NEON_VOP(shl_s32, neon_s32, 1)

525 
#undef NEON_FN

526  
527 
/* 64-bit signed variable shift: right shifts of >= 64 replicate the sign
   bit; left shifts of >= 64 give zero. */
uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        val = 0;
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}
542  
543 
#define NEON_FN(dest, src1, src2) do { \ 
544 
int8_t tmp; \ 
545 
tmp = (int8_t)src2; \ 
546 
if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 
547 
dest = 0; \

548 
} else if (tmp < (ssize_t)sizeof(src1) * 8) { \ 
549 
dest = src1 >> (sizeof(src1) * 8  1); \ 
550 
} else if (tmp == (ssize_t)sizeof(src1) * 8) { \ 
551 
dest = src1 >> (tmp  1); \

552 
dest++; \ 
553 
dest >>= 1; \

554 
} else if (tmp < 0) { \ 
555 
dest = (src1 + (1 << (1  tmp))) >> tmp; \ 
556 
} else { \

557 
dest = src1 << tmp; \ 
558 
}} while (0) 
559 
NEON_VOP(rshl_s8, neon_s8, 4)

560 
NEON_VOP(rshl_s16, neon_s16, 2)

561 
#undef NEON_FN

562  
563 
/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator.  */
uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
{
    int32_t dest;
    int32_t val = (int32_t)valop;
    int8_t shift = (int8_t)shiftop;
    if ((shift >= 32) || (shift <= -32)) {
        dest = 0;
    } else if (shift < 0) {
        /* Rounding right shift: add half an LSB of the result first. */
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
    }
    return dest;
}
580  
581 
/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values.  */
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        val = 0;
    } else if (shift < -64) {
        val >>= 63;
    } else if (shift == -63) {
        /* Shift by exactly 63 with rounding: do it in two steps so the
           rounding constant fits. */
        val >>= 63;
        val++;
        val >>= 1;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == INT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly.  */
            val = 0x4000000000000000LL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}
611  
612 
#define NEON_FN(dest, src1, src2) do { \ 
613 
int8_t tmp; \ 
614 
tmp = (int8_t)src2; \ 
615 
if (tmp >= (ssize_t)sizeof(src1) * 8  \ 
616 
tmp < (ssize_t)sizeof(src1) * 8) { \ 
617 
dest = 0; \

618 
} else if (tmp == (ssize_t)sizeof(src1) * 8) { \ 
619 
dest = src1 >> (tmp  1); \

620 
} else if (tmp < 0) { \ 
621 
dest = (src1 + (1 << (1  tmp))) >> tmp; \ 
622 
} else { \

623 
dest = src1 << tmp; \ 
624 
}} while (0) 
625 
NEON_VOP(rshl_u8, neon_u8, 4)

626 
NEON_VOP(rshl_u16, neon_u16, 2)

627 
#undef NEON_FN

628  
629 
/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator.  */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
{
    uint32_t dest;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32 || shift < -32) {
        dest = 0;
    } else if (shift == -32) {
        /* Rounding a shift of exactly 32 keeps only the carry out of
           the top bit. */
        dest = val >> 31;
    } else if (shift < 0) {
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
    }
    return dest;
}
647  
648 
/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values.  */
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    if (shift >= 64 || shift < -64) {
        val = 0;
    } else if (shift == -64) {
        /* Rounding a 1-bit result just preserves that bit.  */
        val >>= 63;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == UINT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly.  */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}
674  
675 
#define NEON_FN(dest, src1, src2) do { \ 
676 
int8_t tmp; \ 
677 
tmp = (int8_t)src2; \ 
678 
if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 
679 
if (src1) { \

680 
SET_QC(); \ 
681 
dest = ~0; \

682 
} else { \

683 
dest = 0; \

684 
} \ 
685 
} else if (tmp <= (ssize_t)sizeof(src1) * 8) { \ 
686 
dest = 0; \

687 
} else if (tmp < 0) { \ 
688 
dest = src1 >> tmp; \ 
689 
} else { \

690 
dest = src1 << tmp; \ 
691 
if ((dest >> tmp) != src1) { \

692 
SET_QC(); \ 
693 
dest = ~0; \

694 
} \ 
695 
}} while (0) 
696 
NEON_VOP_ENV(qshl_u8, neon_u8, 4)

697 
NEON_VOP_ENV(qshl_u16, neon_u16, 2)

698 
NEON_VOP_ENV(qshl_u32, neon_u32, 1)

699 
#undef NEON_FN

700  
701 
uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) 
702 
{ 
703 
int8_t shift = (int8_t)shiftop; 
704 
if (shift >= 64) { 
705 
if (val) {

706 
val = ~(uint64_t)0;

707 
SET_QC(); 
708 
} 
709 
} else if (shift <= 64) { 
710 
val = 0;

711 
} else if (shift < 0) { 
712 
val >>= shift; 
713 
} else {

714 
uint64_t tmp = val; 
715 
val <<= shift; 
716 
if ((val >> shift) != tmp) {

717 
SET_QC(); 
718 
val = ~(uint64_t)0;

719 
} 
720 
} 
721 
return val;

722 
} 
723  
724 
#define NEON_FN(dest, src1, src2) do { \ 
725 
int8_t tmp; \ 
726 
tmp = (int8_t)src2; \ 
727 
if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 
728 
if (src1) { \

729 
SET_QC(); \ 
730 
dest = (uint32_t)(1 << (sizeof(src1) * 8  1)); \ 
731 
if (src1 > 0) { \ 
732 
dest; \ 
733 
} \ 
734 
} else { \

735 
dest = src1; \ 
736 
} \ 
737 
} else if (tmp <= (ssize_t)sizeof(src1) * 8) { \ 
738 
dest = src1 >> 31; \

739 
} else if (tmp < 0) { \ 
740 
dest = src1 >> tmp; \ 
741 
} else { \

742 
dest = src1 << tmp; \ 
743 
if ((dest >> tmp) != src1) { \

744 
SET_QC(); \ 
745 
dest = (uint32_t)(1 << (sizeof(src1) * 8  1)); \ 
746 
if (src1 > 0) { \ 
747 
dest; \ 
748 
} \ 
749 
} \ 
750 
}} while (0) 
751 
NEON_VOP_ENV(qshl_s8, neon_s8, 4)

752 
NEON_VOP_ENV(qshl_s16, neon_s16, 2)

753 
NEON_VOP_ENV(qshl_s32, neon_s32, 1)

754 
#undef NEON_FN

755  
756 
uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) 
757 
{ 
758 
int8_t shift = (uint8_t)shiftop; 
759 
int64_t val = valop; 
760 
if (shift >= 64) { 
761 
if (val) {

762 
SET_QC(); 
763 
val = (val >> 63) ^ ~SIGNBIT64;

764 
} 
765 
} else if (shift <= 64) { 
766 
val >>= 63;

767 
} else if (shift < 0) { 
768 
val >>= shift; 
769 
} else {

770 
int64_t tmp = val; 
771 
val <<= shift; 
772 
if ((val >> shift) != tmp) {

773 
SET_QC(); 
774 
val = (tmp >> 63) ^ ~SIGNBIT64;

775 
} 
776 
} 
777 
return val;

778 
} 
779  
780 
#define NEON_FN(dest, src1, src2) do { \ 
781 
if (src1 & (1 << (sizeof(src1) * 8  1))) { \ 
782 
SET_QC(); \ 
783 
dest = 0; \

784 
} else { \

785 
int8_t tmp; \ 
786 
tmp = (int8_t)src2; \ 
787 
if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 
788 
if (src1) { \

789 
SET_QC(); \ 
790 
dest = ~0; \

791 
} else { \

792 
dest = 0; \

793 
} \ 
794 
} else if (tmp <= (ssize_t)sizeof(src1) * 8) { \ 
795 
dest = 0; \

796 
} else if (tmp < 0) { \ 
797 
dest = src1 >> tmp; \ 
798 
} else { \

799 
dest = src1 << tmp; \ 
800 
if ((dest >> tmp) != src1) { \

801 
SET_QC(); \ 
802 
dest = ~0; \

803 
} \ 
804 
} \ 
805 
}} while (0) 
806 
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)

807 
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)

808 
#undef NEON_FN

809  
810 
/* 32-bit VQSHLU: negative input saturates to zero with QC; otherwise
   delegate to the unsigned saturating shift. */
uint32_t HELPER(neon_qshlu_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u32(env, valop, shiftop);
}

/* 64-bit VQSHLU (same structure as the 32-bit variant). */
uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u64(env, valop, shiftop);
}
827  
828 
/* FIXME: This is wrong. */

829 
#define NEON_FN(dest, src1, src2) do { \ 
830 
int8_t tmp; \ 
831 
tmp = (int8_t)src2; \ 
832 
if (tmp < 0) { \ 
833 
dest = (src1 + (1 << (1  tmp))) >> tmp; \ 
834 
} else { \

835 
dest = src1 << tmp; \ 
836 
if ((dest >> tmp) != src1) { \

837 
SET_QC(); \ 
838 
dest = ~0; \

839 
} \ 
840 
}} while (0) 
841 
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)

842 
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)

843 
#undef NEON_FN

844  
845 
/* The addition of the rounding constant may overflow, so we use an

846 
* intermediate 64 bits accumulator. */

847 
uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop) 
848 
{ 
849 
uint32_t dest; 
850 
int8_t shift = (int8_t)shiftop; 
851 
if (shift < 0) { 
852 
uint64_t big_dest = ((uint64_t)val + (1 << (1  shift))); 
853 
dest = big_dest >> shift; 
854 
} else {

855 
dest = val << shift; 
856 
if ((dest >> shift) != val) {

857 
SET_QC(); 
858 
dest = ~0;

859 
} 
860 
} 
861 
return dest;

862 
} 
863  
864 
/* Handling addition overflow with 64 bits inputs values is more

865 
* tricky than with 32 bits values. */

866 
uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) 
867 
{ 
868 
int8_t shift = (int8_t)shiftop; 
869 
if (shift < 0) { 
870 
val >>= (shift  1);

871 
if (val == UINT64_MAX) {

872 
/* In this case, it means that the rounding constant is 1,

873 
* and the addition would overflow. Return the actual

874 
* result directly. */

875 
val = 0x8000000000000000ULL;

876 
} else {

877 
val++; 
878 
val >>= 1;

879 
} 
880 
} else { \

881 
uint64_t tmp = val; 
882 
val <<= shift; 
883 
if ((val >> shift) != tmp) {

884 
SET_QC(); 
885 
val = ~0;

886 
} 
887 
} 
888 
return val;

889 
} 
890  
891 
#define NEON_FN(dest, src1, src2) do { \ 
892 
int8_t tmp; \ 
893 
tmp = (int8_t)src2; \ 
894 
if (tmp < 0) { \ 
895 
dest = (src1 + (1 << (1  tmp))) >> tmp; \ 
896 
} else { \

897 
dest = src1 << tmp; \ 
898 
if ((dest >> tmp) != src1) { \

899 
SET_QC(); \ 
900 
dest = src1 >> 31; \

901 
} \ 
902 
}} while (0) 
903 
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)

904 
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)

905 
#undef NEON_FN

906  
907 
/* The addition of the rounding constant may overflow, so we use an

908 
* intermediate 64 bits accumulator. */

909 
uint32_t HELPER(neon_qrshl_s32)(CPUState *env, uint32_t valop, uint32_t shiftop) 
910 
{ 
911 
int32_t dest; 
912 
int32_t val = (int32_t)valop; 
913 
int8_t shift = (int8_t)shiftop; 
914 
if (shift < 0) { 
915 
int64_t big_dest = ((int64_t)val + (1 << (1  shift))); 
916 
dest = big_dest >> shift; 
917 
} else {

918 
dest = val << shift; 
919 
if ((dest >> shift) != val) {

920 
SET_QC(); 
921 
dest = (val >> 31) ^ ~SIGNBIT;

922 
} 
923 
} 
924 
return dest;

925 
} 
926  
927 
/* Handling addition overflow with 64 bits inputs values is more

928 
* tricky than with 32 bits values. */

929 
uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) 
930 
{ 
931 
int8_t shift = (uint8_t)shiftop; 
932 
int64_t val = valop; 
933  
934 
if (shift < 0) { 
935 
val >>= (shift  1);

936 
if (val == INT64_MAX) {

937 
/* In this case, it means that the rounding constant is 1,

938 
* and the addition would overflow. Return the actual

939 
* result directly. */

940 
val = 0x4000000000000000ULL;

941 
} else {

942 
val++; 
943 
val >>= 1;

944 
} 
945 
} else {

946 
int64_t tmp = val; 
947 
val <<= shift; 
948 
if ((val >> shift) != tmp) {

949 
SET_QC(); 
950 
val = (tmp >> 63) ^ ~SIGNBIT64;

951 
} 
952 
} 
953 
return val;

954 
} 
955  
956 
/* Add four packed 8-bit lanes with no carry between lanes: sum the low
   7 bits of each lane, then patch each lane's top bit back in with xor
   so a carry never crosses a lane boundary. */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    const uint32_t high = 0x80808080u;
    uint32_t top = (a ^ b) & high;
    uint32_t low_sum = (a & ~high) + (b & ~high);
    return low_sum ^ top;
}
964  
965 
/* Add two packed 16-bit lanes with no carry between lanes (same top-bit
   xor trick as the 8-bit variant, with 16-bit lane masks). */
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    const uint32_t high = 0x80008000u;
    uint32_t top = (a ^ b) & high;
    uint32_t low_sum = (a & ~high) + (b & ~high);
    return low_sum ^ top;
}
973  
974 
#define NEON_FN(dest, src1, src2) dest = src1 + src2

975 
NEON_POP(padd_u8, neon_u8, 4)

976 
NEON_POP(padd_u16, neon_u16, 2)

977 
#undef NEON_FN

978  
979 
#define NEON_FN(dest, src1, src2) dest = src1  src2

980 
NEON_VOP(sub_u8, neon_u8, 4)

981 
NEON_VOP(sub_u16, neon_u16, 2)

982 
#undef NEON_FN

983  
984 
#define NEON_FN(dest, src1, src2) dest = src1 * src2

985 
NEON_VOP(mul_u8, neon_u8, 4)

986 
NEON_VOP(mul_u16, neon_u16, 2)

987 
#undef NEON_FN

988  
989 
/* Polynomial multiplication is like integer multiplication except the
   partial products are XORed, not added (GF(2) carry-less multiply).
   Four 8-bit lanes are processed in parallel; per-lane masks keep the
   shifted partial products from crossing lane boundaries. */
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
{
    uint32_t mask;
    uint32_t result;
    result = 0;
    while (op1) {
        /* Build a mask selecting the lanes whose current bit is set. */
        mask = 0;
        if (op1 & 1)
            mask |= 0xff;
        if (op1 & (1 << 8))
            mask |= (0xff << 8);
        if (op1 & (1 << 16))
            mask |= (0xff << 16);
        if (op1 & (1 << 24))
            mask |= (0xff << 24);
        result ^= op2 & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2 = (op2 << 1) & 0xfefefefe;
    }
    return result;
}
1012  
1013 
/* Widening polynomial multiply (VMULL.P8): four 8x8 -> 16-bit carry-less
   products.  op2's bytes are first spread into 16-bit lanes of a 64-bit
   accumulator so the shifted partial products have room to widen. */
uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
{
    uint64_t result = 0;
    uint64_t mask;
    uint64_t op2ex = op2;
    op2ex = (op2ex & 0xff) |
        ((op2ex & 0xff00) << 8) |
        ((op2ex & 0xff0000) << 16) |
        ((op2ex & 0xff000000) << 24);
    while (op1) {
        mask = 0;
        if (op1 & 1) {
            mask |= 0xffff;
        }
        if (op1 & (1 << 8)) {
            mask |= (0xffffU << 16);
        }
        if (op1 & (1 << 16)) {
            mask |= (0xffffULL << 32);
        }
        if (op1 & (1 << 24)) {
            mask |= (0xffffULL << 48);
        }
        result ^= op2ex & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2ex <<= 1;
    }
    return result;
}
1042  
1043 
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? 1 : 0 
1044 
NEON_VOP(tst_u8, neon_u8, 4)

1045 
NEON_VOP(tst_u16, neon_u16, 2)

1046 
NEON_VOP(tst_u32, neon_u32, 1)

1047 
#undef NEON_FN

1048  
1049 
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? 1 : 0 
1050 
NEON_VOP(ceq_u8, neon_u8, 4)

1051 
NEON_VOP(ceq_u16, neon_u16, 2)

1052 
NEON_VOP(ceq_u32, neon_u32, 1)

1053 
#undef NEON_FN

1054  
1055 
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? src : src 
1056 
NEON_VOP1(abs_s8, neon_s8, 4)

1057 
NEON_VOP1(abs_s16, neon_s16, 2)

1058 
#undef NEON_FN

1059  
1060 
/* Count Leading Sign/Zero Bits.  */

/* Count leading zero bits of an 8-bit value (8 when x == 0). */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}
1068  
1069 
/* Count leading zero bits of a 16-bit value (16 when x == 0). */
static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}
1076  
1077 
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)

1078 
NEON_VOP1(clz_u8, neon_u8, 4)

1079 
#undef NEON_FN

1080  
1081 
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)

1082 
NEON_VOP1(clz_u16, neon_u16, 2)

1083 
#undef NEON_FN

1084  
1085 
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src)  1 
1086 
NEON_VOP1(cls_s8, neon_s8, 4)

1087 
#undef NEON_FN

1088  
1089 
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src)  1 
1090 
NEON_VOP1(cls_s16, neon_s16, 2)

1091 
#undef NEON_FN

1092  
1093 
/* 32-bit count leading sign bits (VCLS), excluding the sign bit itself:
   complement negative inputs, count leading zeros, subtract one. */
uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}
1102  
1103 
/* Bit count. */
/* Per-byte population count (VCNT.8): classic SWAR reduction — sum bit
   pairs, then nibbles, then bytes, all four lanes in parallel. */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    uint32_t pairs = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    uint32_t nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
    uint32_t bytes = (nibbles & 0x0f0f0f0f) + ((nibbles >> 4) & 0x0f0f0f0f);
    return bytes;
}
1111  
1112 
#define NEON_QDMULH16(dest, src1, src2, round) do { \ 
1113 
uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ 
1114 
if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ 
1115 
SET_QC(); \ 
1116 
tmp = (tmp >> 31) ^ ~SIGNBIT; \

1117 
} else { \

1118 
tmp <<= 1; \

1119 
} \ 
1120 
if (round) { \

1121 
int32_t old = tmp; \ 
1122 
tmp += 1 << 15; \ 
1123 
if ((int32_t)tmp < old) { \

1124 
SET_QC(); \ 
1125 
tmp = SIGNBIT  1; \

1126 
} \ 
1127 
} \ 
1128 
dest = tmp >> 16; \

1129 
} while(0) 
1130 
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) 
1131 
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)

1132 
#undef NEON_FN

1133 
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) 
1134 
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)

1135 
#undef NEON_FN

1136 
#undef NEON_QDMULH16

1137  
1138 
#define NEON_QDMULH32(dest, src1, src2, round) do { \ 
1139 
uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ 
1140 
if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ 
1141 
SET_QC(); \ 
1142 
tmp = (tmp >> 63) ^ ~SIGNBIT64; \

1143 
} else { \

1144 
tmp <<= 1; \

1145 
} \ 
1146 
if (round) { \

1147 
int64_t old = tmp; \ 
1148 
tmp += (int64_t)1 << 31; \ 
1149 
if ((int64_t)tmp < old) { \

1150 
SET_QC(); \ 
1151 
tmp = SIGNBIT64  1; \

1152 
} \ 
1153 
} \ 
1154 
dest = tmp >> 32; \

1155 
} while(0) 
1156 
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) 
1157 
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)

1158 
#undef NEON_FN

1159 
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) 
1160 
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)

1161 
#undef NEON_FN

1162 
#undef NEON_QDMULH32

1163  
1164 
uint32_t HELPER(neon_narrow_u8)(uint64_t x) 
1165 
{ 
1166 
return (x & 0xffu)  ((x >> 8) & 0xff00u)  ((x >> 16) & 0xff0000u) 
1167 
 ((x >> 24) & 0xff000000u); 
1168 
} 
1169  
1170 
uint32_t HELPER(neon_narrow_u16)(uint64_t x) 
1171 
{ 
1172 
return (x & 0xffffu)  ((x >> 16) & 0xffff0000u); 
1173 
} 
1174  
1175 
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) 
1176 
{ 
1177 
return ((x >> 8) & 0xff)  ((x >> 16) & 0xff00) 
1178 
 ((x >> 24) & 0xff0000)  ((x >> 32) & 0xff000000); 
1179 
} 
1180  
1181 
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) 
1182 
{ 
1183 
return ((x >> 16) & 0xffff)  ((x >> 32) & 0xffff0000); 
1184 
} 
1185  
1186 
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) 
1187 
{ 
1188 
x &= 0xff80ff80ff80ff80ull;

1189 
x += 0x0080008000800080ull;

1190 
return ((x >> 8) & 0xff)  ((x >> 16) & 0xff00) 
1191 
 ((x >> 24) & 0xff0000)  ((x >> 32) & 0xff000000); 
1192 
} 
1193  
1194 
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) 
1195 
{ 
1196 
x &= 0xffff8000ffff8000ull;

1197 
x += 0x0000800000008000ull;

1198 
return ((x >> 16) & 0xffff)  ((x >> 32) & 0xffff0000); 
1199 
} 
1200  
1201 
/* Narrow four SIGNED 16-bit lanes to UNSIGNED bytes with saturation.
 * Negative lanes clamp to 0 and lanes above 0xff clamp to 0xff; either
 * case sets the QC (cumulative saturation) flag.  */
uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
1227  
1228 
/* Narrow four unsigned 16-bit lanes to bytes, clamping to 0xff and
 * setting QC on saturation.  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
1250  
1251 
/* Narrow four signed 16-bit lanes to signed bytes, clamping to
 * [-0x80, 0x7f] ((s >> 15) ^ 0x7f picks the correct extreme) and setting
 * QC on saturation.  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
1273  
1274 
/* Narrow two SIGNED 32-bit lanes to UNSIGNED 16-bit with saturation:
 * negative lanes clamp to 0, lanes above 0xffff clamp to 0xffff; QC is
 * set on any saturation.  */
uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}
1296  
1297 
/* Narrow two unsigned 32-bit lanes to 16-bit, clamping to 0xffff and
 * setting QC on saturation.  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}
1313  
1314 
/* Narrow two signed 32-bit lanes to signed 16-bit, clamping to
 * [-0x8000, 0x7fff] and setting QC on saturation.  */
uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    /* Mask low so sign-extended bits don't leak into the high lane.  */
    return (uint16_t)low | (high << 16);
}
1330  
1331 
/* Narrow one SIGNED 64-bit value to UNSIGNED 32-bit with saturation:
 * negative clamps to 0, too-large clamps to 0xffffffff; QC set on
 * either.  */
uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
1343  
1344 
/* Narrow an unsigned 64-bit value to 32-bit, clamping to 0xffffffff and
 * setting QC on saturation.  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
1352  
1353 
/* Narrow a signed 64-bit value to signed 32-bit, clamping to
 * [INT32_MIN, INT32_MAX] and setting QC on saturation.  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        /* Sign of the input selects the matching extreme value.  */
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
1361  
1362 
uint64_t HELPER(neon_widen_u8)(uint32_t x) 
1363 
{ 
1364 
uint64_t tmp; 
1365 
uint64_t ret; 
1366 
ret = (uint8_t)x; 
1367 
tmp = (uint8_t)(x >> 8);

1368 
ret = tmp << 16;

1369 
tmp = (uint8_t)(x >> 16);

1370 
ret = tmp << 32;

1371 
tmp = (uint8_t)(x >> 24);

1372 
ret = tmp << 48;

1373 
return ret;

1374 
} 
1375  
1376 
uint64_t HELPER(neon_widen_s8)(uint32_t x) 
1377 
{ 
1378 
uint64_t tmp; 
1379 
uint64_t ret; 
1380 
ret = (uint16_t)(int8_t)x; 
1381 
tmp = (uint16_t)(int8_t)(x >> 8);

1382 
ret = tmp << 16;

1383 
tmp = (uint16_t)(int8_t)(x >> 16);

1384 
ret = tmp << 32;

1385 
tmp = (uint16_t)(int8_t)(x >> 24);

1386 
ret = tmp << 48;

1387 
return ret;

1388 
} 
1389  
1390 
uint64_t HELPER(neon_widen_u16)(uint32_t x) 
1391 
{ 
1392 
uint64_t high = (uint16_t)(x >> 16);

1393 
return ((uint16_t)x)  (high << 32); 
1394 
} 
1395  
1396 
uint64_t HELPER(neon_widen_s16)(uint32_t x) 
1397 
{ 
1398 
uint64_t high = (int16_t)(x >> 16);

1399 
return ((uint32_t)(int16_t)x)  (high << 32); 
1400 
} 
1401  
1402 
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) 
1403 
{ 
1404 
uint64_t mask; 
1405 
mask = (a ^ b) & 0x8000800080008000ull;

1406 
a &= ~0x8000800080008000ull;

1407 
b &= ~0x8000800080008000ull;

1408 
return (a + b) ^ mask;

1409 
} 
1410  
1411 
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) 
1412 
{ 
1413 
uint64_t mask; 
1414 
mask = (a ^ b) & 0x8000000080000000ull;

1415 
a &= ~0x8000000080000000ull;

1416 
b &= ~0x8000000080000000ull;

1417 
return (a + b) ^ mask;

1418 
} 
1419  
1420 
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) 
1421 
{ 
1422 
uint64_t tmp; 
1423 
uint64_t tmp2; 
1424  
1425 
tmp = a & 0x0000ffff0000ffffull;

1426 
tmp += (a >> 16) & 0x0000ffff0000ffffull; 
1427 
tmp2 = b & 0xffff0000ffff0000ull;

1428 
tmp2 += (b << 16) & 0xffff0000ffff0000ull; 
1429 
return ( tmp & 0xffff) 
1430 
 ((tmp >> 16) & 0xffff0000ull) 
1431 
 ((tmp2 << 16) & 0xffff00000000ull) 
1432 
 ( tmp2 & 0xffff000000000000ull);

1433 
} 
1434  
1435 
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) 
1436 
{ 
1437 
uint32_t low = a + (a >> 32);

1438 
uint32_t high = b + (b >> 32);

1439 
return low + ((uint64_t)high << 32); 
1440 
} 
1441  
1442 
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) 
1443 
{ 
1444 
uint64_t mask; 
1445 
mask = (a ^ ~b) & 0x8000800080008000ull;

1446 
a = 0x8000800080008000ull;

1447 
b &= ~0x8000800080008000ull;

1448 
return (a  b) ^ mask;

1449 
} 
1450  
1451 
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) 
1452 
{ 
1453 
uint64_t mask; 
1454 
mask = (a ^ ~b) & 0x8000000080000000ull;

1455 
a = 0x8000000080000000ull;

1456 
b &= ~0x8000000080000000ull;

1457 
return (a  b) ^ mask;

1458 
} 
1459  
1460 
/* Saturating signed addition of two 32-bit lanes.  Overflow occurs when
 * the operands share a sign but the sum does not; the saturated value is
 * derived from the operand sign.  QC is set per saturating lane.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}
1481  
1482 
/* Saturating signed 64-bit addition; sets QC and clamps to the extreme
 * matching a's sign on overflow.  */
uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
1493  
1494 
/* Absolute difference of x and y evaluated in the given type. */
#define DO_ABD(dest, x, y, type) do { \
    type tmp_x = x; \
    type tmp_y = y; \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
1499  
1500 
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 
1501 
{ 
1502 
uint64_t tmp; 
1503 
uint64_t result; 
1504 
DO_ABD(result, a, b, uint8_t); 
1505 
DO_ABD(tmp, a >> 8, b >> 8, uint8_t); 
1506 
result = tmp << 16;

1507 
DO_ABD(tmp, a >> 16, b >> 16, uint8_t); 
1508 
result = tmp << 32;

1509 
DO_ABD(tmp, a >> 24, b >> 24, uint8_t); 
1510 
result = tmp << 48;

1511 
return result;

1512 
} 
1513  
1514 
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 
1515 
{ 
1516 
uint64_t tmp; 
1517 
uint64_t result; 
1518 
DO_ABD(result, a, b, int8_t); 
1519 
DO_ABD(tmp, a >> 8, b >> 8, int8_t); 
1520 
result = tmp << 16;

1521 
DO_ABD(tmp, a >> 16, b >> 16, int8_t); 
1522 
result = tmp << 32;

1523 
DO_ABD(tmp, a >> 24, b >> 24, int8_t); 
1524 
result = tmp << 48;

1525 
return result;

1526 
} 
1527  
1528 
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 
1529 
{ 
1530 
uint64_t tmp; 
1531 
uint64_t result; 
1532 
DO_ABD(result, a, b, uint16_t); 
1533 
DO_ABD(tmp, a >> 16, b >> 16, uint16_t); 
1534 
return result  (tmp << 32); 
1535 
} 
1536  
1537 
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 
1538 
{ 
1539 
uint64_t tmp; 
1540 
uint64_t result; 
1541 
DO_ABD(result, a, b, int16_t); 
1542 
DO_ABD(tmp, a >> 16, b >> 16, int16_t); 
1543 
return result  (tmp << 32); 
1544 
} 
1545  
1546 
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 
1547 
{ 
1548 
uint64_t result; 
1549 
DO_ABD(result, a, b, uint32_t); 
1550 
return result;

1551 
} 
1552  
1553 
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 
1554 
{ 
1555 
uint64_t result; 
1556 
DO_ABD(result, a, b, int32_t); 
1557 
return result;

1558 
} 
1559 
#undef DO_ABD

1560  
1561 
/* Widening multiply. Named type is the source type. */
/* type1 fixes the (possibly signed) lane extraction; type2 is the wider
 * type the product is computed and delivered in.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)
1567  
1568 
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) 
1569 
{ 
1570 
uint64_t tmp; 
1571 
uint64_t result; 
1572  
1573 
DO_MULL(result, a, b, uint8_t, uint16_t); 
1574 
DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); 
1575 
result = tmp << 16;

1576 
DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); 
1577 
result = tmp << 32;

1578 
DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); 
1579 
result = tmp << 48;

1580 
return result;

1581 
} 
1582  
1583 
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) 
1584 
{ 
1585 
uint64_t tmp; 
1586 
uint64_t result; 
1587  
1588 
DO_MULL(result, a, b, int8_t, uint16_t); 
1589 
DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); 
1590 
result = tmp << 16;

1591 
DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); 
1592 
result = tmp << 32;

1593 
DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); 
1594 
result = tmp << 48;

1595 
return result;

1596 
} 
1597  
1598 
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) 
1599 
{ 
1600 
uint64_t tmp; 
1601 
uint64_t result; 
1602  
1603 
DO_MULL(result, a, b, uint16_t, uint32_t); 
1604 
DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 
1605 
return result  (tmp << 32); 
1606 
} 
1607  
1608 
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) 
1609 
{ 
1610 
uint64_t tmp; 
1611 
uint64_t result; 
1612  
1613 
DO_MULL(result, a, b, int16_t, uint32_t); 
1614 
DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); 
1615 
return result  (tmp << 32); 
1616 
} 
1617  
1618 
uint64_t HELPER(neon_negl_u16)(uint64_t x) 
1619 
{ 
1620 
uint16_t tmp; 
1621 
uint64_t result; 
1622 
result = (uint16_t)x; 
1623 
tmp = (x >> 16);

1624 
result = (uint64_t)tmp << 16;

1625 
tmp = (x >> 32);

1626 
result = (uint64_t)tmp << 32;

1627 
tmp = (x >> 48);

1628 
result = (uint64_t)tmp << 48;

1629 
return result;

1630 
} 
1631  
1632 
uint64_t HELPER(neon_negl_u32)(uint64_t x) 
1633 
{ 
1634 
uint32_t low = x; 
1635 
uint32_t high = (x >> 32);

1636 
return low  ((uint64_t)high << 32); 
1637 
} 
1638  
1639 
/* FIXME: There should be a native op for this. */

1640 
uint64_t HELPER(neon_negl_u64)(uint64_t x) 
1641 
{ 
1642 
return x;

1643 
} 
1644  
1645 
/* Saturnating sign manuipulation. */

1646 
/* ??? Make these use NEON_VOP1 */

1647 
#define DO_QABS8(x) do { \ 
1648 
if (x == (int8_t)0x80) { \ 
1649 
x = 0x7f; \

1650 
SET_QC(); \ 
1651 
} else if (x < 0) { \ 
1652 
x = x; \ 
1653 
}} while (0) 
1654 
uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x) 
1655 
{ 
1656 
neon_s8 vec; 
1657 
NEON_UNPACK(neon_s8, vec, x); 
1658 
DO_QABS8(vec.v1); 
1659 
DO_QABS8(vec.v2); 
1660 
DO_QABS8(vec.v3); 
1661 
DO_QABS8(vec.v4); 
1662 
NEON_PACK(neon_s8, x, vec); 
1663 
return x;

1664 
} 
1665 
#undef DO_QABS8

1666  
1667 
#define DO_QNEG8(x) do { \ 
1668 
if (x == (int8_t)0x80) { \ 
1669 
x = 0x7f; \

1670 
SET_QC(); \ 
1671 
} else { \

1672 
x = x; \ 
1673 
}} while (0) 
1674 
uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x) 
1675 
{ 
1676 
neon_s8 vec; 
1677 
NEON_UNPACK(neon_s8, vec, x); 
1678 
DO_QNEG8(vec.v1); 
1679 
DO_QNEG8(vec.v2); 
1680 
DO_QNEG8(vec.v3); 
1681 
DO_QNEG8(vec.v4); 
1682 
NEON_PACK(neon_s8, x, vec); 
1683 
return x;

1684 
} 
1685 
#undef DO_QNEG8

1686  
1687 
#define DO_QABS16(x) do { \ 
1688 
if (x == (int16_t)0x8000) { \ 
1689 
x = 0x7fff; \

1690 
SET_QC(); \ 
1691 
} else if (x < 0) { \ 
1692 
x = x; \ 
1693 
}} while (0) 
1694 
uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x) 
1695 
{ 
1696 
neon_s16 vec; 
1697 
NEON_UNPACK(neon_s16, vec, x); 
1698 
DO_QABS16(vec.v1); 
1699 
DO_QABS16(vec.v2); 
1700 
NEON_PACK(neon_s16, x, vec); 
1701 
return x;

1702 
} 
1703 
#undef DO_QABS16

1704  
1705 
#define DO_QNEG16(x) do { \ 
1706 
if (x == (int16_t)0x8000) { \ 
1707 
x = 0x7fff; \

1708 
SET_QC(); \ 
1709 
} else { \

1710 
x = x; \ 
1711 
}} while (0) 
1712 
uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x) 
1713 
{ 
1714 
neon_s16 vec; 
1715 
NEON_UNPACK(neon_s16, vec, x); 
1716 
DO_QNEG16(vec.v1); 
1717 
DO_QNEG16(vec.v2); 
1718 
NEON_PACK(neon_s16, x, vec); 
1719 
return x;

1720 
} 
1721 
#undef DO_QNEG16

1722  
1723 
/* Saturating 32-bit absolute value: INT32_MIN clamps to INT32_MAX with QC. */
uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}
1733  
1734 
/* Saturating 32-bit negation: INT32_MIN clamps to INT32_MAX with QC. */
uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}
1744  
1745 
/* NEON Float helpers. */

1746 
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b) 
1747 
{ 
1748 
float32 f0 = vfp_itos(a); 
1749 
float32 f1 = vfp_itos(b); 
1750 
return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b; 
1751 
} 
1752  
1753 
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b) 
1754 
{ 
1755 
float32 f0 = vfp_itos(a); 
1756 
float32 f1 = vfp_itos(b); 
1757 
return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b; 
1758 
} 
1759  
1760 
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b) 
1761 
{ 
1762 
float32 f0 = vfp_itos(a); 
1763 
float32 f1 = vfp_itos(b); 
1764 
return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1) 
1765 
? float32_sub(f0, f1, NFS) 
1766 
: float32_sub(f1, f0, NFS)); 
1767 
} 
1768  
1769 
uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b) 
1770 
{ 
1771 
return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));

1772 
} 
1773  
1774 
uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b) 
1775 
{ 
1776 
return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));

1777 
} 
1778  
1779 
uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b) 
1780 
{ 
1781 
return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));

1782 
} 
1783  
1784 
/* Floating point comparisons produce an integer result. */

1785 
#define NEON_VOP_FCMP(name, cmp) \

1786 
uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \ 
1787 
{ \ 
1788 
if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \ 
1789 
return ~0; \ 
1790 
else \

1791 
return 0; \ 
1792 
} 
1793  
1794 
NEON_VOP_FCMP(ceq_f32, ==) 
1795 
NEON_VOP_FCMP(cge_f32, >=) 
1796 
NEON_VOP_FCMP(cgt_f32, >) 
1797  
1798 
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b) 
1799 
{ 
1800 
float32 f0 = float32_abs(vfp_itos(a)); 
1801 
float32 f1 = float32_abs(vfp_itos(b)); 
1802 
return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0; 
1803 
} 
1804  
1805 
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b) 
1806 
{ 
1807 
float32 f0 = float32_abs(vfp_itos(a)); 
1808 
float32 f1 = float32_abs(vfp_itos(b)); 
1809 
return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0; 
1810 
} 
1811  
1812 
/* Extract element N of width SIZE bits from vector word V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1813  
1814 
/* Quadword VUZP.8: de-interleave byte elements of the d and m register
 * pairs; even-indexed elements go to d, odd-indexed to m.  */
void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1841  
1842 
/* Quadword VUZP.16: de-interleave halfword elements of the register
 * pairs.  */
void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1861  
1862 
/* Quadword VUZP.32: de-interleave word elements of the register pairs. */
void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1877  
1878 
/* Doubleword VUZP.8: de-interleave bytes of registers d and m. */
void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
1893  
1894 
/* Doubleword VUZP.16: de-interleave halfwords of registers d and m. */
void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
1905  
1906 
/* Quadword VZIP.8: interleave byte elements of the d and m register
 * pairs.  */
void HELPER(neon_qzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1933  
1934 
/* Quadword VZIP.16: interleave halfword elements of the register pairs. */
void HELPER(neon_qzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1953  
1954 
/* Quadword VZIP.32: interleave word elements of the register pairs. */
void HELPER(neon_qzip32)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1969  
1970 
/* Doubleword VZIP.8: interleave bytes of registers d and m. */
void HELPER(neon_zip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
1985  
1986 
/* Doubleword VZIP.16: interleave halfwords of registers d and m. */
void HELPER(neon_zip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}