/* Revision 4242b1bd — target-i386/ops_sse.h (SSSE3 helper additions) */
/* |
2 |
* MMX/3DNow!/SSE/SSE2/SSE3/PNI support |
|
2 |
* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
|
|
3 | 3 |
* |
4 | 4 |
* Copyright (c) 2005 Fabrice Bellard |
5 | 5 |
* |
... | ... | |
1275 | 1275 |
} |
1276 | 1276 |
#endif |
1277 | 1277 |
|
1278 |
/* SSSE3 op helpers */ |
|
1279 |
void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s) |
|
1280 |
{ |
|
1281 |
int i; |
|
1282 |
Reg r; |
|
1283 |
|
|
1284 |
for (i = 0; i < (8 << SHIFT); i++) |
|
1285 |
r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1))); |
|
1286 |
|
|
1287 |
*d = r; |
|
1288 |
} |
|
1289 |
|
|
1290 |
/* PHADDW: horizontal add of signed 16-bit lanes; low half of the result
 * comes from d, high half from s.
 *
 * Fix: the original wrote the d-derived lanes into *d before reading s,
 * producing wrong high lanes when both operands are the same register
 * (e.g. phaddw %xmm0,%xmm0).  Compute into a temporary instead. */
void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
    r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
    XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
    XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
    r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
    r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
    XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
    XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));

    *d = r;
}
|
1301 |
|
|
1302 |
/* PHADDD: horizontal add of 32-bit lanes; low half from d, high half
 * from s.
 *
 * Fix: the original clobbered d->L(0)/L(1) before reading s, which is
 * wrong when d == s (phaddd with identical source and destination).
 * Compute into a temporary instead. */
void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
    XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
    r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
    XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));

    *d = r;
}
|
1309 |
|
|
1310 |
/* PHADDSW: horizontal add of signed 16-bit lanes with signed
 * saturation (satsw); low half from d, high half from s.
 *
 * Fix: like phaddw, the original modified *d before reading s, giving
 * wrong high lanes when d == s.  Compute into a temporary instead. */
void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
    r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
    XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
    XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
    r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
    r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
    XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
    XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));

    *d = r;
}
|
1321 |
|
|
1322 |
/* PMADDUBSW: multiply unsigned bytes of d by signed bytes of s, add
 * adjacent products, and store as signed-saturated 16-bit words.
 * Writing in place is safe even when d == s, because each output word
 * W(i) depends only on the two byte lanes B(2i), B(2i+1) it overlays. */
void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s)
{
    int i;

    for (i = 0; i < (4 << SHIFT); i++) {
        d->W(i) = satsw((int8_t)s->B(2 * i)     * (uint8_t)d->B(2 * i) +
                        (int8_t)s->B(2 * i + 1) * (uint8_t)d->B(2 * i + 1));
    }
}
|
1343 |
|
|
1344 |
/* PHSUBW: horizontal subtract of signed 16-bit lanes; low half of the
 * result comes from d, high half from s.
 *
 * Fix: the original wrote into *d before reading s, producing wrong
 * high lanes when both operands are the same register.  Compute into a
 * temporary instead. */
void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
    r.W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
    XMM_ONLY(r.W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
    XMM_ONLY(r.W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
    r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
    r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
    XMM_ONLY(r.W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
    XMM_ONLY(r.W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));

    *d = r;
}
|
1355 |
|
|
1356 |
/* PHSUBD: horizontal subtract of 32-bit lanes; low half from d, high
 * half from s.
 *
 * Fix: the original clobbered d->L(0)/L(1) before reading s, which is
 * wrong when d == s.  Compute into a temporary instead. */
void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
    XMM_ONLY(r.L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
    r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
    XMM_ONLY(r.L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));

    *d = r;
}
|
1363 |
|
|
1364 |
/* PHSUBSW: horizontal subtract of signed 16-bit lanes with signed
 * saturation (satsw); low half from d, high half from s.
 *
 * Fix: like phsubw, the original modified *d before reading s, giving
 * wrong high lanes when d == s.  Compute into a temporary instead. */
void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
    r.W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
    XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
    XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
    r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
    r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
    XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
    XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));

    *d = r;
}
|
1375 |
|
|
1376 |
#define FABSB(_, x) x > INT8_MAX ? -(int8_t ) x : x |
|
1377 |
#define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x |
|
1378 |
#define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x |
|
1379 |
SSE_HELPER_B(helper_pabsb, FABSB) |
|
1380 |
SSE_HELPER_W(helper_pabsw, FABSW) |
|
1381 |
SSE_HELPER_L(helper_pabsd, FABSL) |
|
1382 |
|
|
1383 |
#define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15 |
|
1384 |
SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) |
|
1385 |
|
|
1386 |
#define FSIGNB(d, s) s <= INT8_MAX ? s ? d : 0 : -(int8_t ) d |
|
1387 |
#define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d |
|
1388 |
#define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d |
|
1389 |
SSE_HELPER_B(helper_psignb, FSIGNB) |
|
1390 |
SSE_HELPER_W(helper_psignw, FSIGNW) |
|
1391 |
SSE_HELPER_L(helper_psignd, FSIGNL) |
|
1392 |
|
|
1393 |
/* PALIGNR: concatenate d (high) with s (low), shift the pair right by
 * `shift` bytes, and keep the low register-width bytes.  The result is
 * assembled per 64-bit quadword by OR-ing right/left shifts of each
 * source quadword; a temporary is used so d == s is handled correctly. */
void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
{
    Reg r;

    /* XXX could be checked during translation */
    if (shift >= (16 << SHIFT)) {
        /* Shift count covers the whole 2*width concatenation: result is 0. */
        r.Q(0) = 0;
        XMM_ONLY(r.Q(1) = 0);
    } else {
        shift <<= 3; /* byte count -> bit count */
/* Shift v right by i bits when i > 0, left by -i bits when i < 0, and
 * yield 0 when |i| >= 64 (the quadword contributes nothing).  NOTE:
 * arguments are substituted unparenthesized; all uses below pass simple
 * expressions, so this is safe only within this block. */
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
        /* MMX: the 128-bit concatenation d:s collapses into one qword. */
        r.Q(0) = SHR(s->Q(0), shift - 0) |
                 SHR(d->Q(0), shift - 64);
#else
        /* SSE: each result qword gathers bits from all four source
         * qwords of the 256-bit concatenation d:s. */
        r.Q(0) = SHR(s->Q(0), shift - 0) |
                 SHR(s->Q(1), shift - 64) |
                 SHR(d->Q(0), shift - 128) |
                 SHR(d->Q(1), shift - 192);
        /* shift >= 0, so the s->Q(0) term (shift + 64 >= 64) is always
         * 0; it is kept for symmetry with the other terms. */
        r.Q(1) = SHR(s->Q(0), shift + 64) |
                 SHR(s->Q(1), shift - 0) |
                 SHR(d->Q(0), shift - 64) |
                 SHR(d->Q(1), shift - 128);
#endif
#undef SHR
    }

    *d = r;
}
|
1422 |
|
|
1278 | 1423 |
/* Undefine the per-instantiation macros so this header can be included
 * again with a different SHIFT (MMX vs SSE register width). */
#undef SHIFT
#undef XMM_ONLY
#undef Reg
/* (diff-viewer footer removed: "Also available in: Unified diff") */