Revision 4242b1bd target-i386/ops_sse.h

b/target-i386/ops_sse.h
1 1
/*
2
 *  MMX/3DNow!/SSE/SSE2/SSE3/PNI support
2
 *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
3 3
 *
4 4
 *  Copyright (c) 2005 Fabrice Bellard
5 5
 *
......
1275 1275
}
1276 1276
#endif
1277 1277

  
1278
/* SSSE3 op helpers */
1279
/*
 * PSHUFB helper: rearrange the bytes of *d under control of the selector
 * bytes in *s.
 *
 * For each byte position, a selector with bit 7 set produces 0; otherwise
 * the low bits of the selector (3 bits when SHIFT == 0, 4 bits when
 * SHIFT == 1) pick a byte of the original *d.  The result is assembled in a
 * scratch register so that every lookup sees the unmodified destination.
 */
void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s)
{
    const int nbytes = 8 << SHIFT;
    Reg out;
    int n;

    for (n = 0; n < nbytes; n++) {
        if (s->B(n) & 0x80) {
            out.B(n) = 0;
        } else {
            out.B(n) = d->B(s->B(n) & (nbytes - 1));
        }
    }

    *d = out;
}
1289

  
1290
/*
 * PHADDW helper: horizontal add of adjacent 16-bit words.
 *
 * The lower half of the result holds the pairwise sums of *d, the upper
 * half holds the pairwise sums of *s.  Sums wrap to 16 bits.
 *
 * The result is built in a temporary and copied to *d at the end: d and s
 * may point to the same register, and writing *d in place would corrupt the
 * source words before the upper half has been computed.
 */
void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
    r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
    XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
    XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
    r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
    r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
    XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
    XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));

    *d = r;
}
1301

  
1302
/*
 * PHADDD helper: horizontal add of adjacent 32-bit doublewords.
 *
 * The lower half of the result holds the pairwise sums of *d, the upper
 * half holds the pairwise sums of *s.
 *
 * The additions are done on uint32_t so that they wrap modulo 2^32; adding
 * the values as int32_t would be undefined behaviour on overflow.  The
 * result is built in a temporary because d and s may point to the same
 * register, in which case an in-place update would clobber the source
 * before it is read.
 */
void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.L(0) = (uint32_t)d->L(0) + (uint32_t)d->L(1);
    XMM_ONLY(r.L(1) = (uint32_t)d->L(2) + (uint32_t)d->L(3));
    r.L((1 << SHIFT) + 0) = (uint32_t)s->L(0) + (uint32_t)s->L(1);
    XMM_ONLY(r.L(3) = (uint32_t)s->L(2) + (uint32_t)s->L(3));

    *d = r;
}
1309

  
1310
/*
 * PHADDSW helper: horizontal add of adjacent signed 16-bit words, with each
 * sum passed through satsw().
 *
 * The lower half of the result comes from the word pairs of *d, the upper
 * half from the word pairs of *s.
 *
 * The result is built in a temporary and copied to *d at the end: d and s
 * may point to the same register, and writing *d in place would corrupt the
 * source words before the upper half has been computed.
 */
void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
    r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
    XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
    XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
    r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
    r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
    XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
    XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));

    *d = r;
}
1321

  
1322
/*
 * PMADDUBSW helper: for every 16-bit lane, multiply the two signed bytes of
 * *s by the corresponding unsigned bytes of *d, add the two products and
 * store the satsw() of the sum in that lane of *d.
 *
 * Lanes are processed in ascending order (4 lanes when SHIFT == 0, 8 when
 * SHIFT == 1); each lane reads only its own two byte positions before it is
 * written.
 */
void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s)
{
    int lane;

    for (lane = 0; lane < (4 << SHIFT); lane++) {
        int even = (int8_t)s->B(2 * lane) * (uint8_t)d->B(2 * lane);
        int odd = (int8_t)s->B(2 * lane + 1) * (uint8_t)d->B(2 * lane + 1);

        d->W(lane) = satsw(even + odd);
    }
}
1343

  
1344
/*
 * PHSUBW helper: horizontal subtract of adjacent 16-bit words (even word
 * minus the following odd word).
 *
 * The lower half of the result comes from the word pairs of *d, the upper
 * half from the word pairs of *s.  Differences wrap to 16 bits.
 *
 * The result is built in a temporary and copied to *d at the end: d and s
 * may point to the same register, and writing *d in place would corrupt the
 * source words before the upper half has been computed.
 */
void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
    r.W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
    XMM_ONLY(r.W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
    XMM_ONLY(r.W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
    r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
    r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
    XMM_ONLY(r.W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
    XMM_ONLY(r.W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));

    *d = r;
}
1355

  
1356
/*
 * PHSUBD helper: horizontal subtract of adjacent 32-bit doublewords (even
 * element minus the following odd element).
 *
 * The lower half of the result comes from the pairs of *d, the upper half
 * from the pairs of *s.
 *
 * The subtractions are done on uint32_t so that they wrap modulo 2^32;
 * subtracting the values as int32_t would be undefined behaviour on
 * overflow.  The result is built in a temporary because d and s may point
 * to the same register, in which case an in-place update would clobber the
 * source before it is read.
 */
void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.L(0) = (uint32_t)d->L(0) - (uint32_t)d->L(1);
    XMM_ONLY(r.L(1) = (uint32_t)d->L(2) - (uint32_t)d->L(3));
    r.L((1 << SHIFT) + 0) = (uint32_t)s->L(0) - (uint32_t)s->L(1);
    XMM_ONLY(r.L(3) = (uint32_t)s->L(2) - (uint32_t)s->L(3));

    *d = r;
}
1363

  
1364
/*
 * PHSUBSW helper: horizontal subtract of adjacent signed 16-bit words (even
 * word minus the following odd word), with each difference passed through
 * satsw().
 *
 * The lower half of the result comes from the word pairs of *d, the upper
 * half from the word pairs of *s.
 *
 * The result is built in a temporary and copied to *d at the end: d and s
 * may point to the same register, and writing *d in place would corrupt the
 * source words before the upper half has been computed.
 */
void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
    r.W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
    XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
    XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
    r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
    r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
    XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
    XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));

    *d = r;
}
1375

  
1376
#define FABSB(_, x) x > INT8_MAX  ? -(int8_t ) x : x
1377
#define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x
1378
#define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x
1379
SSE_HELPER_B(helper_pabsb, FABSB)
1380
SSE_HELPER_W(helper_pabsw, FABSW)
1381
SSE_HELPER_L(helper_pabsd, FABSL)
1382

  
1383
#define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15
1384
SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1385

  
1386
#define FSIGNB(d, s) s <= INT8_MAX  ? s ? d : 0 : -(int8_t ) d
1387
#define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d
1388
#define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d
1389
SSE_HELPER_B(helper_psignb, FSIGNB)
1390
SSE_HELPER_W(helper_psignw, FSIGNW)
1391
SSE_HELPER_L(helper_psignd, FSIGNL)
1392

  
1393
/*
 * PALIGNR helper: view *d (high part) and *s (low part) as one value twice
 * the register width, shift it right by `shift` bytes and store the low
 * register-width part of the result in *d.
 *
 * A byte count of 16 << SHIFT or more yields an all-zero result.  The
 * result is assembled in a scratch register and copied to *d at the end.
 */
void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
{
    Reg r;

    /* XXX could be checked during translation */
    if (shift < (16 << SHIFT)) {
        /* From here on the count is expressed in bits rather than bytes. */
        shift <<= 3;
/*
 * Shift v right by i bits; a negative i shifts left by -i bits, and any
 * count whose magnitude is 64 or more contributes 0.
 */
#define SHR(v, i) \
    ((i) > -64 && (i) < 64 ? ((i) > 0 ? (v) >> (i) : (v) << -(i)) : 0)
#if SHIFT == 0
        r.Q(0) = SHR(s->Q(0), shift -   0) |
                 SHR(d->Q(0), shift -  64);
#else
        r.Q(0) = SHR(s->Q(0), shift -   0) |
                 SHR(s->Q(1), shift -  64) |
                 SHR(d->Q(0), shift - 128) |
                 SHR(d->Q(1), shift - 192);
        r.Q(1) = SHR(s->Q(0), shift +  64) |
                 SHR(s->Q(1), shift -   0) |
                 SHR(d->Q(0), shift -  64) |
                 SHR(d->Q(1), shift - 128);
#endif
#undef SHR
    } else {
        /* Everything has been shifted out. */
        r.Q(0) = 0;
        XMM_ONLY(r.Q(1) = 0);
    }

    *d = r;
}
1422

  
1278 1423
/*
 * Drop the SHIFT, XMM_ONLY and Reg macros used by the helpers above.
 * NOTE(review): presumably this lets the header be included again with a
 * different SHIFT value -- confirm against the including file.
 */
#undef SHIFT
1279 1424
#undef XMM_ONLY
1280 1425
#undef Reg

Also available in: Unified diff