root / target-mips / lmi_helper.c @ c09b437b
History | View | Annotate | Download (14 kB)
1 |
/*
|
---|---|
2 |
* Loongson Multimedia Instruction emulation helpers for QEMU.
|
3 |
*
|
4 |
* Copyright (c) 2011 Richard Henderson <rth@twiddle.net>
|
5 |
*
|
6 |
* This library is free software; you can redistribute it and/or
|
7 |
* modify it under the terms of the GNU Lesser General Public
|
8 |
* License as published by the Free Software Foundation; either
|
9 |
* version 2 of the License, or (at your option) any later version.
|
10 |
*
|
11 |
* This library is distributed in the hope that it will be useful,
|
12 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 |
* Lesser General Public License for more details.
|
15 |
*
|
16 |
* You should have received a copy of the GNU Lesser General Public
|
17 |
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
18 |
*/
|
19 |
|
20 |
#include "cpu.h" |
21 |
#include "helper.h" |
22 |
|
23 |
/* If the byte ordering doesn't matter, i.e. all columns are treated
   identically, then this union can be used directly.  If byte ordering
   does matter, we generally ignore dumping to memory.  */
/* 64-bit multimedia register viewed as lanes of various widths.
   NOTE(review): on a big-endian host element 0 of each array is the MOST
   significant lane; lane-uniform helpers don't care, the others correct
   indices with BYTE_ORDER_XOR below. */
typedef union {
    uint8_t  ub[8];   /* eight unsigned bytes */
    int8_t   sb[8];   /* eight signed bytes */
    uint16_t uh[4];   /* four unsigned halfwords */
    int16_t  sh[4];   /* four signed halfwords */
    uint32_t uw[2];   /* two unsigned words */
    int32_t  sw[2];   /* two signed words */
    uint64_t d;       /* the whole 64-bit value */
} LMIValue;
35 |
|
36 |
/* Some byte ordering issues can be mitigated by XORing in the following. */
/* XORing a lane index with (nlanes - 1) on a big-endian host maps array
   element order back to guest significance order, so that index 0 always
   names the least significant lane. On little-endian hosts no correction
   is needed. */
#ifdef HOST_WORDS_BIGENDIAN
# define BYTE_ORDER_XOR(N) N
#else
# define BYTE_ORDER_XOR(N) 0
#endif
|
42 |
|
43 |
/*
 * Saturation helpers for one lane result.
 * SATSB/SATSH/SATSW clamp a signed value to the signed range of the lane.
 * SATUB/SATUH/SATUW clamp ONLY the upper bound: callers performing
 * operations that can go negative must clamp at zero themselves.
 *
 * Fix: every macro argument is now parenthesized so that expansions such
 * as SATUB(a ? b : c) parse correctly (previously only plain identifiers
 * were safe).  Note the arguments are still evaluated more than once.
 */
#define SATSB(x)  ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
#define SATUB(x)  ((x) > 0xff ? 0xff : (x))

#define SATSH(x)  ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
#define SATUH(x)  ((x) > 0xffff ? 0xffff : (x))

#define SATSW(x) \
    ((x) < -0x80000000ll ? -0x80000000ll : (x) > 0x7fffffff ? 0x7fffffff : (x))
#define SATUW(x)  ((x) > 0xffffffffull ? 0xffffffffull : (x))
52 |
|
53 |
/* PADDSB: parallel add of eight signed bytes, each lane saturating
   to [-0x80, 0x7f]. */
uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        int r = (int8_t)(fs >> shift) + (int8_t)(ft >> shift);
        if (r < -0x80) {
            r = -0x80;
        } else if (r > 0x7f) {
            r = 0x7f;
        }
        fd |= (uint64_t)(r & 0xff) << shift;
    }
    return fd;
}
66 |
|
67 |
/* PADDUSB: parallel add of eight unsigned bytes, each lane saturating
   to 0xff. */
uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        unsigned r = ((fs >> shift) & 0xff) + ((ft >> shift) & 0xff);
        fd |= (uint64_t)(r > 0xff ? 0xff : r) << shift;
    }
    return fd;
}
80 |
|
81 |
/* PADDSH: parallel add of four signed halfwords, each lane saturating
   to [-0x8000, 0x7fff]. */
uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        int r = (int16_t)(fs >> shift) + (int16_t)(ft >> shift);
        if (r < -0x8000) {
            r = -0x8000;
        } else if (r > 0x7fff) {
            r = 0x7fff;
        }
        fd |= (uint64_t)(r & 0xffff) << shift;
    }
    return fd;
}
94 |
|
95 |
/* PADDUSH: parallel add of four unsigned halfwords, each lane saturating
   to 0xffff. */
uint64_t helper_paddush(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        unsigned r = ((fs >> shift) & 0xffff) + ((ft >> shift) & 0xffff);
        fd |= (uint64_t)(r > 0xffff ? 0xffff : r) << shift;
    }
    return fd;
}
108 |
|
109 |
/* PADDB: parallel modular (wrapping) add of eight bytes. */
uint64_t helper_paddb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        /* Low 8 bits of the lane sum; upper bits are masked away. */
        fd |= (((fs >> shift) + (ft >> shift)) & 0xff) << shift;
    }
    return fd;
}
121 |
|
122 |
/* PADDH: parallel modular (wrapping) add of four halfwords. */
uint64_t helper_paddh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        fd |= (((fs >> shift) + (ft >> shift)) & 0xffff) << shift;
    }
    return fd;
}
134 |
|
135 |
/* PADDW: parallel modular (wrapping) add of two words. */
uint64_t helper_paddw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 32) {
        fd |= (((fs >> shift) + (ft >> shift)) & 0xffffffffull) << shift;
    }
    return fd;
}
147 |
|
148 |
/* PSUBSB: parallel subtract of eight signed bytes, each lane saturating
   to [-0x80, 0x7f]. */
uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        int r = (int8_t)(fs >> shift) - (int8_t)(ft >> shift);
        if (r < -0x80) {
            r = -0x80;
        } else if (r > 0x7f) {
            r = 0x7f;
        }
        fd |= (uint64_t)(r & 0xff) << shift;
    }
    return fd;
}
161 |
|
162 |
/*
 * PSUBUSB: parallel subtract of eight unsigned bytes with unsigned
 * saturation: each lane result is clamped to [0, 0xff].
 *
 * Bug fix: the previous code ran the difference through SATUB(), which
 * only clamps the upper bound, so a negative difference was truncated
 * when stored back into the uint8_t lane (e.g. 3 - 5 produced 0xfe).
 * Unsigned saturating subtraction must clamp at zero instead.
 */
uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        int r = (int)((fs >> shift) & 0xff) - (int)((ft >> shift) & 0xff);
        if (r < 0) {
            r = 0;   /* saturate instead of wrapping */
        }
        fd |= (uint64_t)r << shift;
    }
    return fd;
}
175 |
|
176 |
/* PSUBSH: parallel subtract of four signed halfwords, each lane
   saturating to [-0x8000, 0x7fff]. */
uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        int r = (int16_t)(fs >> shift) - (int16_t)(ft >> shift);
        if (r < -0x8000) {
            r = -0x8000;
        } else if (r > 0x7fff) {
            r = 0x7fff;
        }
        fd |= (uint64_t)(r & 0xffff) << shift;
    }
    return fd;
}
189 |
|
190 |
/*
 * PSUBUSH: parallel subtract of four unsigned halfwords with unsigned
 * saturation: each lane result is clamped to [0, 0xffff].
 *
 * Bug fix: the previous code used SATUH(), which only clamps the upper
 * bound, so a negative difference wrapped modulo 2^16 instead of
 * saturating to 0 (e.g. 2 - 5 produced 0xfffd).
 */
uint64_t helper_psubush(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        int r = (int)((fs >> shift) & 0xffff) - (int)((ft >> shift) & 0xffff);
        if (r < 0) {
            r = 0;   /* saturate instead of wrapping */
        }
        fd |= (uint64_t)r << shift;
    }
    return fd;
}
203 |
|
204 |
/* PSUBB: parallel modular (wrapping) subtract of eight bytes. */
uint64_t helper_psubb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        /* Low 8 bits depend only on the low 8 bits of each operand. */
        fd |= (((fs >> shift) - (ft >> shift)) & 0xff) << shift;
    }
    return fd;
}
216 |
|
217 |
/* PSUBH: parallel modular (wrapping) subtract of four halfwords. */
uint64_t helper_psubh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        fd |= (((fs >> shift) - (ft >> shift)) & 0xffff) << shift;
    }
    return fd;
}
229 |
|
230 |
/* PSUBW: parallel modular (wrapping) subtract of two words. */
uint64_t helper_psubw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 32) {
        fd |= (((fs >> shift) - (ft >> shift)) & 0xffffffffull) << shift;
    }
    return fd;
}
242 |
|
243 |
/* PSHUFH: shuffle the four halfwords of fs.  Each destination lane i
   takes the fs lane selected by bits [2i+1:2i] of ft (lane 0 = least
   significant halfword). */
uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned i;

    for (i = 0; i < 4; i++, ft >>= 2) {
        unsigned sel = ft & 3;
        fd |= ((fs >> (sel * 16)) & 0xffff) << (i * 16);
    }
    return fd;
}
256 |
|
257 |
/* PACKSSWH: pack the four signed words of fs (low half of result) and
   ft (high half) into halfwords with signed saturation. */
uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    int64_t w[4];
    unsigned i;

    w[0] = (int32_t)(fs >> 0);
    w[1] = (int32_t)(fs >> 32);
    w[2] = (int32_t)(ft >> 0);
    w[3] = (int32_t)(ft >> 32);

    for (i = 0; i < 4; i++) {
        int64_t t = w[i];
        if (t < -0x8000) {
            t = -0x8000;
        } else if (t > 0x7fff) {
            t = 0x7fff;
        }
        fd |= (uint64_t)(t & 0xffff) << (i * 16);
    }
    return fd;
}
280 |
|
281 |
/* PACKSSHB: pack the eight signed halfwords of fs (low four bytes of
   result) and ft (high four bytes) into bytes with signed saturation. */
uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned i;

    for (i = 0; i < 8; ++i) {
        /* Lanes 0-3 come from fs, lanes 4-7 from ft. */
        int16_t t = i < 4 ? fs >> (i * 16) : ft >> ((i - 4) * 16);
        int v = t < -0x80 ? -0x80 : t > 0x7f ? 0x7f : t;
        fd |= (uint64_t)(v & 0xff) << (i * 8);
    }
    return fd;
}
299 |
|
300 |
/*
 * PACKUSHB: pack the eight signed halfwords of fs (low four bytes of the
 * result) and ft (high four bytes) into bytes with UNSIGNED saturation:
 * each lane is clamped to [0, 0xff].
 *
 * Bug fix: the previous code used SATUB(), which only clamps the upper
 * bound, so a negative halfword kept its low byte (e.g. -1 packed to
 * 0xff, -256 packed to 0x00) instead of saturating to 0 as unsigned
 * saturation requires.
 */
uint64_t helper_packushb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned i;

    for (i = 0; i < 8; ++i) {
        /* Lanes 0-3 come from fs, lanes 4-7 from ft. */
        int16_t t = i < 4 ? fs >> (i * 16) : ft >> ((i - 4) * 16);
        int v = t < 0 ? 0 : t > 0xff ? 0xff : t;
        fd |= (uint64_t)v << (i * 8);
    }
    return fd;
}
318 |
|
319 |
/* PUNPCKLWD: interleave the low words — result = { ft.w0 : fs.w0 }. */
uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
{
    uint64_t lo = fs & 0xffffffffull;
    uint64_t hi = ft << 32;

    return hi | lo;
}
323 |
|
324 |
/* PUNPCKHWD: interleave the high words — result = { ft.w1 : fs.w1 }. */
uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
{
    uint64_t lo = fs >> 32;
    uint64_t hi = (ft >> 32) << 32;

    return hi | lo;
}
328 |
|
329 |
/* PUNPCKLHW: interleave the two low halfwords of each operand —
   result lanes (low to high) = { fs.h0, ft.h0, fs.h1, ft.h1 }. */
uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned k;

    for (k = 0; k < 2; k++) {
        fd |= ((fs >> (k * 16)) & 0xffff) << (k * 32);
        fd |= ((ft >> (k * 16)) & 0xffff) << (k * 32 + 16);
    }
    return fd;
}
343 |
|
344 |
/* PUNPCKHHW: interleave the two high halfwords of each operand —
   result lanes (low to high) = { fs.h2, ft.h2, fs.h3, ft.h3 }. */
uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned k;

    for (k = 0; k < 2; k++) {
        fd |= ((fs >> (k * 16 + 32)) & 0xffff) << (k * 32);
        fd |= ((ft >> (k * 16 + 32)) & 0xffff) << (k * 32 + 16);
    }
    return fd;
}
358 |
|
359 |
/* PUNPCKLBH: interleave the four low bytes of each operand —
   result bytes (low to high) = { fs.b0, ft.b0, ..., fs.b3, ft.b3 }. */
uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned k;

    for (k = 0; k < 4; k++) {
        fd |= ((fs >> (k * 8)) & 0xff) << (k * 16);
        fd |= ((ft >> (k * 8)) & 0xff) << (k * 16 + 8);
    }
    return fd;
}
377 |
|
378 |
/* PUNPCKHBH: interleave the four high bytes of each operand —
   result bytes (low to high) = { fs.b4, ft.b4, ..., fs.b7, ft.b7 }. */
uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned k;

    for (k = 0; k < 4; k++) {
        fd |= ((fs >> (k * 8 + 32)) & 0xff) << (k * 16);
        fd |= ((ft >> (k * 8 + 32)) & 0xff) << (k * 16 + 8);
    }
    return fd;
}
396 |
|
397 |
/* PAVGH: per-halfword unsigned average, rounding half up. */
uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint64_t a = (fs >> shift) & 0xffff;
        uint64_t b = (ft >> shift) & 0xffff;
        fd |= ((a + b + 1) >> 1) << shift;
    }
    return fd;
}
409 |
|
410 |
/* PAVGB: per-byte unsigned average, rounding half up. */
uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;
        fd |= ((a + b + 1) >> 1) << shift;
    }
    return fd;
}
422 |
|
423 |
/* PMAXSH: per-halfword signed maximum. */
uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        int16_t a = fs >> shift;
        int16_t b = ft >> shift;
        fd |= (uint64_t)(uint16_t)(a >= b ? a : b) << shift;
    }
    return fd;
}
435 |
|
436 |
/* PMINSH: per-halfword signed minimum. */
uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        int16_t a = fs >> shift;
        int16_t b = ft >> shift;
        fd |= (uint64_t)(uint16_t)(a <= b ? a : b) << shift;
    }
    return fd;
}
448 |
|
449 |
/*
 * PMAXUB: per-byte unsigned maximum across all eight byte lanes.
 *
 * Bug fix: the previous loop ran i < 4 over the 8-element ub[] array,
 * so only half of the byte lanes were compared and the remaining four
 * result bytes were passed through from fs unmodified.
 */
uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;
        fd |= (a >= b ? a : b) << shift;
    }
    return fd;
}
461 |
|
462 |
/*
 * PMINUB: per-byte unsigned minimum across all eight byte lanes.
 *
 * Bug fix: the previous loop ran i < 4 over the 8-element ub[] array,
 * so only half of the byte lanes were compared and the remaining four
 * result bytes were passed through from fs unmodified.
 */
uint64_t helper_pminub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;
        fd |= (a <= b ? a : b) << shift;
    }
    return fd;
}
474 |
|
475 |
/* PCMPEQW: per-word equality compare; equal lanes become all-ones,
   unequal lanes become zero. */
uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 32) {
        uint64_t a = (fs >> shift) & 0xffffffffull;
        uint64_t b = (ft >> shift) & 0xffffffffull;
        if (a == b) {
            fd |= 0xffffffffull << shift;
        }
    }
    return fd;
}
487 |
|
488 |
/* PCMPGTW: per-word UNSIGNED greater-than compare; true lanes become
   all-ones, false lanes become zero. */
uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 32) {
        uint64_t a = (fs >> shift) & 0xffffffffull;
        uint64_t b = (ft >> shift) & 0xffffffffull;
        if (a > b) {
            fd |= 0xffffffffull << shift;
        }
    }
    return fd;
}
500 |
|
501 |
/* PCMPEQH: per-halfword equality compare; equal lanes become all-ones,
   unequal lanes become zero. */
uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint64_t a = (fs >> shift) & 0xffff;
        uint64_t b = (ft >> shift) & 0xffff;
        if (a == b) {
            fd |= 0xffffull << shift;
        }
    }
    return fd;
}
513 |
|
514 |
/* PCMPGTH: per-halfword UNSIGNED greater-than compare; true lanes become
   all-ones, false lanes become zero. */
uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint64_t a = (fs >> shift) & 0xffff;
        uint64_t b = (ft >> shift) & 0xffff;
        if (a > b) {
            fd |= 0xffffull << shift;
        }
    }
    return fd;
}
526 |
|
527 |
uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft) |
528 |
{ |
529 |
LMIValue vs, vt; |
530 |
unsigned i;
|
531 |
|
532 |
vs.d = fs; |
533 |
vt.d = ft; |
534 |
for (i = 0; i < 8; i++) { |
535 |
vs.ub[i] = -(vs.ub[i] == vt.ub[i]); |
536 |
} |
537 |
return vs.d;
|
538 |
} |
539 |
|
540 |
/* PCMPGTB: per-byte UNSIGNED greater-than compare; true lanes become
   all-ones, false lanes become zero. */
uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned shift;

    for (shift = 0; shift < 64; shift += 8) {
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;
        if (a > b) {
            fd |= 0xffull << shift;
        }
    }
    return fd;
}
552 |
|
553 |
/* PSLLW: logical left shift of both word lanes by ft (mod 128).
   Shift counts above 31 clear the whole result. */
uint64_t helper_psllw(uint64_t fs, uint64_t ft)
{
    unsigned n = ft & 0x7f;
    uint64_t fd = 0;
    unsigned pos;

    if (n > 31) {
        return 0;
    }
    for (pos = 0; pos < 64; pos += 32) {
        uint64_t w = (fs >> pos) & 0xffffffffull;
        fd |= ((w << n) & 0xffffffffull) << pos;
    }
    return fd;
}
568 |
|
569 |
/* PSRLW: logical right shift of both word lanes by ft (mod 128).
   Shift counts above 31 clear the whole result. */
uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
{
    unsigned n = ft & 0x7f;
    uint64_t fd = 0;
    unsigned pos;

    if (n > 31) {
        return 0;
    }
    for (pos = 0; pos < 64; pos += 32) {
        uint64_t w = (fs >> pos) & 0xffffffffull;
        fd |= (w >> n) << pos;
    }
    return fd;
}
584 |
|
585 |
/* PSRAW: arithmetic right shift of both word lanes by ft (mod 128).
   Shift counts above 31 are clamped to 31 (result fills with the sign). */
uint64_t helper_psraw(uint64_t fs, uint64_t ft)
{
    unsigned n = ft & 0x7f;
    uint64_t fd = 0;
    unsigned pos;

    if (n > 31) {
        n = 31;
    }
    for (pos = 0; pos < 64; pos += 32) {
        int32_t w = fs >> pos;
        /* >> on a negative int32_t is the host's arithmetic shift,
           exactly as the original union-based code relied on. */
        fd |= (uint64_t)(uint32_t)(w >> n) << pos;
    }
    return fd;
}
600 |
|
601 |
/* PSLLH: logical left shift of all four halfword lanes by ft (mod 128).
   Shift counts above 15 clear the whole result. */
uint64_t helper_psllh(uint64_t fs, uint64_t ft)
{
    unsigned n = ft & 0x7f;
    uint64_t fd = 0;
    unsigned pos;

    if (n > 15) {
        return 0;
    }
    for (pos = 0; pos < 64; pos += 16) {
        uint64_t h = (fs >> pos) & 0xffff;
        fd |= ((h << n) & 0xffff) << pos;
    }
    return fd;
}
616 |
|
617 |
/* PSRLH: logical right shift of all four halfword lanes by ft (mod 128).
   Shift counts above 15 clear the whole result. */
uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
{
    unsigned n = ft & 0x7f;
    uint64_t fd = 0;
    unsigned pos;

    if (n > 15) {
        return 0;
    }
    for (pos = 0; pos < 64; pos += 16) {
        uint64_t h = (fs >> pos) & 0xffff;
        fd |= (h >> n) << pos;
    }
    return fd;
}
632 |
|
633 |
/* PSRAH: arithmetic right shift of all four halfword lanes by ft
   (mod 128).  Shift counts above 15 are clamped to 15. */
uint64_t helper_psrah(uint64_t fs, uint64_t ft)
{
    unsigned n = ft & 0x7f;
    uint64_t fd = 0;
    unsigned pos;

    if (n > 15) {
        n = 15;
    }
    for (pos = 0; pos < 64; pos += 16) {
        int16_t h = fs >> pos;
        /* Arithmetic shift of the sign-extended lane, as before. */
        fd |= (uint64_t)(uint16_t)(h >> n) << pos;
    }
    return fd;
}
648 |
|
649 |
/* PMULLH: per-halfword multiply, keeping the low 16 bits of each
   product (identical for signed and unsigned interpretation). */
uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned pos;

    for (pos = 0; pos < 64; pos += 16) {
        uint64_t p = ((fs >> pos) & 0xffff) * ((ft >> pos) & 0xffff);
        fd |= (p & 0xffff) << pos;
    }
    return fd;
}
661 |
|
662 |
/* PMULHH: per-halfword SIGNED multiply, keeping the high 16 bits of
   each 32-bit product. */
uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned pos;

    for (pos = 0; pos < 64; pos += 16) {
        int32_t p = (int16_t)(fs >> pos) * (int16_t)(ft >> pos);
        fd |= (uint64_t)(uint16_t)(p >> 16) << pos;
    }
    return fd;
}
675 |
|
676 |
/* PMULHUH: per-halfword UNSIGNED multiply, keeping the high 16 bits of
   each 32-bit product. */
uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned pos;

    for (pos = 0; pos < 64; pos += 16) {
        uint32_t p = (uint32_t)((fs >> pos) & 0xffff)
                   * (uint32_t)((ft >> pos) & 0xffff);
        fd |= (uint64_t)(p >> 16) << pos;
    }
    return fd;
}
689 |
|
690 |
/* PMADDHW: signed multiply-add of halfword pairs.  The low result word
   is h0*h0' + h1*h1', the high result word is h2*h2' + h3*h3'.
   Sums accumulate in uint32_t (wrapping), as in the original. */
uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
{
    uint32_t p0, p1;

    p0  = (uint32_t)((int16_t)(fs >>  0) * (int16_t)(ft >>  0));
    p0 += (uint32_t)((int16_t)(fs >> 16) * (int16_t)(ft >> 16));
    p1  = (uint32_t)((int16_t)(fs >> 32) * (int16_t)(ft >> 32));
    p1 += (uint32_t)((int16_t)(fs >> 48) * (int16_t)(ft >> 48));

    return ((uint64_t)p1 << 32) | p0;
}
705 |
|
706 |
/* PASUBUB: per-byte absolute difference of unsigned bytes. */
uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned pos;

    for (pos = 0; pos < 64; pos += 8) {
        int a = (fs >> pos) & 0xff;
        int b = (ft >> pos) & 0xff;
        fd |= (uint64_t)(a > b ? a - b : b - a) << pos;
    }
    return fd;
}
719 |
|
720 |
/* BIADD: sum all eight unsigned bytes of fs into the low halfword of
   the result (maximum possible sum 8 * 255 = 2040 fits easily). */
uint64_t helper_biadd(uint64_t fs)
{
    unsigned sum = 0;
    unsigned pos;

    for (pos = 0; pos < 64; pos += 8) {
        sum += (fs >> pos) & 0xff;
    }
    return sum & 0xffff;
}
729 |
|
730 |
/* PMOVMSKB: gather the most significant bit of each byte of fs into
   the low 8 bits of the result (byte i -> bit i). */
uint64_t helper_pmovmskb(uint64_t fs)
{
    unsigned fd = 0;
    unsigned i;

    for (i = 0; i < 8; i++) {
        fd |= ((fs >> (i * 8 + 7)) & 1) << i;
    }
    return fd & 0xff;
}