Revision 222a3336

b/target-i386/ops_sse.h
1 1
/*
2
 *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
2
 *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3 3
 *
4 4
 *  Copyright (c) 2005 Fabrice Bellard
5
 *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
5 6
 *
6 7
 * This library is free software; you can redistribute it and/or
7 8
 * modify it under the terms of the GNU Lesser General Public
......
1420 1421
    *d = r;
1421 1422
}
1422 1423

  
1424
#define XMM0 env->xmm_regs[0]
1425

  
1426
#if SHIFT == 1
1427
/*
 * One lane of a variable blend: lane i of d is recomputed by F from lane i
 * of d, lane i of s and lane i of XMM0 (the selector operand).
 */
#define SSE_HELPER_V_LANE(elem, F, i)\
    d->elem(i) = F(d->elem(i), s->elem(i), XMM0.elem(i))

/*
 * SSE_HELPER_V(name, elem, num, F) defines helper "name" which applies the
 * three-operand lane function F to the first num lanes (accessor elem) of
 * d and s, using the matching lane of XMM0 as third argument.
 * num is expected to be 2, 4, 8 or 16.
 */
#define SSE_HELPER_V(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    SSE_HELPER_V_LANE(elem, F, 0);\
    SSE_HELPER_V_LANE(elem, F, 1);\
    if (num > 2) {\
        SSE_HELPER_V_LANE(elem, F, 2);\
        SSE_HELPER_V_LANE(elem, F, 3);\
    }\
    if (num > 4) {\
        SSE_HELPER_V_LANE(elem, F, 4);\
        SSE_HELPER_V_LANE(elem, F, 5);\
        SSE_HELPER_V_LANE(elem, F, 6);\
        SSE_HELPER_V_LANE(elem, F, 7);\
    }\
    if (num > 8) {\
        SSE_HELPER_V_LANE(elem, F, 8);\
        SSE_HELPER_V_LANE(elem, F, 9);\
        SSE_HELPER_V_LANE(elem, F, 10);\
        SSE_HELPER_V_LANE(elem, F, 11);\
        SSE_HELPER_V_LANE(elem, F, 12);\
        SSE_HELPER_V_LANE(elem, F, 13);\
        SSE_HELPER_V_LANE(elem, F, 14);\
        SSE_HELPER_V_LANE(elem, F, 15);\
    }\
}
1453

  
1454
/*
 * One lane of an immediate-controlled operation: lane i of d is recomputed
 * by F from lane i of d, lane i of s and bit i of the immediate.
 */
#define SSE_HELPER_I_LANE(elem, F, i)\
    d->elem(i) = F(d->elem(i), s->elem(i), ((imm >> i) & 1))

/*
 * SSE_HELPER_I(name, elem, num, F) defines helper "name" which applies the
 * lane function F to the first num lanes (accessor elem) of d and s; the
 * third argument of F is bit <lane index> of imm.
 * num is expected to be 2, 4, 8 or 16.
 */
#define SSE_HELPER_I(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
{\
    SSE_HELPER_I_LANE(elem, F, 0);\
    SSE_HELPER_I_LANE(elem, F, 1);\
    if (num > 2) {\
        SSE_HELPER_I_LANE(elem, F, 2);\
        SSE_HELPER_I_LANE(elem, F, 3);\
    }\
    if (num > 4) {\
        SSE_HELPER_I_LANE(elem, F, 4);\
        SSE_HELPER_I_LANE(elem, F, 5);\
        SSE_HELPER_I_LANE(elem, F, 6);\
        SSE_HELPER_I_LANE(elem, F, 7);\
    }\
    if (num > 8) {\
        SSE_HELPER_I_LANE(elem, F, 8);\
        SSE_HELPER_I_LANE(elem, F, 9);\
        SSE_HELPER_I_LANE(elem, F, 10);\
        SSE_HELPER_I_LANE(elem, F, 11);\
        SSE_HELPER_I_LANE(elem, F, 12);\
        SSE_HELPER_I_LANE(elem, F, 13);\
        SSE_HELPER_I_LANE(elem, F, 14);\
        SSE_HELPER_I_LANE(elem, F, 15);\
    }\
}
1480

  
1481
/* SSE4.1 op helpers */
1482
/*
 * Variable blend lane functions: yield s when the top bit of the selector
 * lane m (8, 32 or 64 bits wide) is set, d otherwise.  Arguments and the
 * whole expansion are parenthesised so the macros are safe inside larger
 * expressions; the 64-bit constant carries an explicit ULL suffix.
 */
#define FBLENDVB(d, s, m) (((m) & 0x80) ? (s) : (d))
#define FBLENDVPS(d, s, m) (((m) & 0x80000000) ? (s) : (d))
#define FBLENDVPD(d, s, m) (((m) & 0x8000000000000000ULL) ? (s) : (d))
1485
SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
1486
SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
1487
SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
1488

  
1489
/*
 * PTEST: sets CC_SRC from the two 64-bit halves of d and s.
 * CC_Z is set when (s & d) is all zeroes, CC_C when (s & ~d) is all
 * zeroes; every other bit of CC_SRC is written as zero.
 */
void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s)
{
    uint64_t and_bits = 0;
    uint64_t andn_bits = 0;
    int flags = 0;
    int half;

    for (half = 0; half < 2; half++) {
        and_bits |= s->Q(half) & d->Q(half);
        andn_bits |= s->Q(half) & ~d->Q(half);
    }

    if (and_bits == 0)
        flags |= CC_Z;
    if (andn_bits == 0)
        flags |= CC_C;

    CC_SRC = flags;
}
1496

  
1497
/*
 * SSE_HELPER_F(name, elem, num, F) defines helper "name" which fills the
 * num destination lanes (accessor elem) of d with F(lane index).  It is
 * used for the PMOVSX and PMOVZX widening moves, where F reads a narrower
 * lane of s.
 *
 * Only lanes below num are written: num is 2 for Q, 4 for L and 8 for W,
 * so touching lanes 2 and 3 unconditionally, or lanes 4 and 5 when num
 * is 4, would write past the end of the register.
 *
 * Lanes are stored from the highest index down.  When d and s are the same
 * register, storing wide lane i overwrites narrow source lanes with index
 * at or above i; by then all of those have already been consumed, and
 * F(i) itself is evaluated before the store.
 */
#define SSE_HELPER_F(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    if (num > 2) {\
        if (num > 4) {\
            d->elem(7) = F(7);\
            d->elem(6) = F(6);\
            d->elem(5) = F(5);\
            d->elem(4) = F(4);\
        }\
        d->elem(3) = F(3);\
        d->elem(2) = F(2);\
    }\
    d->elem(1) = F(1);\
    d->elem(0) = F(0);\
}
1513

  
1514
SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
1515
SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
1516
SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
1517
SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
1518
SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
1519
SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
1520
SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
1521
SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
1522
SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
1523
SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
1524
SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
1525
SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
1526

  
1527
/*
 * PMULDQ: multiplies the signed 32-bit lanes 0 and 2 of d and s and stores
 * the two 64-bit products in d->Q(0) and d->Q(1).
 */
void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s)
{
    /* Sign-extend every factor to 64 bits before multiplying. */
    int64_t lhs_lo = (int32_t) d->L(0);
    int64_t rhs_lo = (int32_t) s->L(0);
    int64_t lhs_hi = (int32_t) d->L(2);
    int64_t rhs_hi = (int32_t) s->L(2);

    d->Q(0) = lhs_lo * rhs_lo;
    d->Q(1) = lhs_hi * rhs_hi;
}
1532

  
1533
#define FCMPEQQ(d, s) d == s ? -1 : 0
1534
SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
1535

  
1536
/*
 * PACKUSDW: packs the four signed 32-bit lanes of d followed by the four
 * signed 32-bit lanes of s into eight 16-bit lanes, each passed through
 * satuw(), and stores the result in d.
 *
 * The result is assembled in a temporary register and copied to d at the
 * end: storing directly into d->W() would overwrite 32-bit source lanes
 * before they are read when d and s are the same register.
 */
void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = satuw((int32_t) d->L(0));
    r.W(1) = satuw((int32_t) d->L(1));
    r.W(2) = satuw((int32_t) d->L(2));
    r.W(3) = satuw((int32_t) d->L(3));
    r.W(4) = satuw((int32_t) s->L(0));
    r.W(5) = satuw((int32_t) s->L(1));
    r.W(6) = satuw((int32_t) s->L(2));
    r.W(7) = satuw((int32_t) s->L(3));

    *d = r;
}
1547

  
1548
#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s)
1549
#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s)
1550
#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s)
1551
#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s)
1552
SSE_HELPER_B(helper_pminsb, FMINSB)
1553
SSE_HELPER_L(helper_pminsd, FMINSD)
1554
SSE_HELPER_W(helper_pminuw, MIN)
1555
SSE_HELPER_L(helper_pminud, MIN)
1556
SSE_HELPER_B(helper_pmaxsb, FMAXSB)
1557
SSE_HELPER_L(helper_pmaxsd, FMAXSD)
1558
SSE_HELPER_W(helper_pmaxuw, MAX)
1559
SSE_HELPER_L(helper_pmaxud, MAX)
1560

  
1561
#define FMULLD(d, s) (int32_t) d * (int32_t) s
1562
SSE_HELPER_L(helper_pmulld, FMULLD)
1563

  
1564
/*
 * PHMINPOSUW: finds the smallest of the eight 16-bit lanes of s.
 * d->W(0) receives the minimum value, d->W(1) its lane index, and the
 * remaining bits of d are cleared.
 */
void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s)
{
    int idx = 0;
    int i;

    /* Strict '<' keeps the lowest index when several lanes tie. */
    for (i = 1; i < 8; i++)
        if (s->W(i) < s->W(idx))
            idx = i;

    /*
     * Fetch the minimum before any other store: d may be the same register
     * as s, in which case clearing d or writing the index first would
     * destroy the lane that still has to be read.
     */
    d->W(0) = s->W(idx);
    d->W(1) = idx;
    d->L(1) = 0;
    d->Q(1) = 0;
}
1588

  
1589
/*
 * ROUNDPS: rounds each of the four single-precision lanes of s to an
 * integral value and stores the results in d.
 *
 * mode bit 2 clear: bits 1:0 pick the rounding mode (0 nearest-even,
 *                   1 down, 2 up, 3 toward zero) for this operation.
 * mode bit 2 set:   the rounding mode already in env->sse_status is used.
 *
 * The rounding mode of env->sse_status is restored before returning.
 */
void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
{
    signed char prev_rounding_mode;

    prev_rounding_mode = env->sse_status.float_rounding_mode;
    if (!(mode & (1 << 2))) {
        switch (mode & 3) {
        case 0:
            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
            break;
        case 1:
            set_float_rounding_mode(float_round_down, &env->sse_status);
            break;
        case 2:
            set_float_rounding_mode(float_round_up, &env->sse_status);
            break;
        case 3:
            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
            break;
        }
    }

    /* The L lanes hold 32-bit floats, so the float32 routine is required. */
    d->L(0) = float32_round_to_int(s->L(0), &env->sse_status);
    d->L(1) = float32_round_to_int(s->L(1), &env->sse_status);
    d->L(2) = float32_round_to_int(s->L(2), &env->sse_status);
    d->L(3) = float32_round_to_int(s->L(3), &env->sse_status);

#if 0 /* TODO */
    if (mode & (1 << 3))
        set_float_exception_flags(
                        get_float_exception_flags(&env->sse_status) &
                        ~float_flag_inexact,
                        &env->sse_status);
#endif
    env->sse_status.float_rounding_mode = prev_rounding_mode;
}
1624

  
1625
/*
 * ROUNDPD: rounds both double-precision lanes of s to integral values and
 * stores them in d.  When mode bit 2 is clear, mode bits 1:0 select the
 * rounding mode for this operation; otherwise the mode already held in
 * env->sse_status is used.  The previous rounding mode is restored on exit.
 */
void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
{
    /* Rounding mode selected by the two low bits of the immediate. */
    const int imm_rounding[4] = {
        float_round_nearest_even,
        float_round_down,
        float_round_up,
        float_round_to_zero,
    };
    signed char saved_mode = env->sse_status.float_rounding_mode;
    int lane;

    if ((mode & (1 << 2)) == 0)
        set_float_rounding_mode(imm_rounding[mode & 3], &env->sse_status);

    for (lane = 0; lane < 2; lane++)
        d->Q(lane) = float64_round_to_int(s->Q(lane), &env->sse_status);

    /*
     * TODO: when mode bit 3 is set, float_flag_inexact should be removed
     * from the exception flags of env->sse_status (not implemented).
     */
    env->sse_status.float_rounding_mode = saved_mode;
}
1658

  
1659
/*
 * ROUNDSS: rounds the lowest single-precision lane of s to an integral
 * value and stores it in d->L(0); the other lanes of d are left untouched.
 *
 * mode bit 2 clear: bits 1:0 pick the rounding mode (0 nearest-even,
 *                   1 down, 2 up, 3 toward zero) for this operation.
 * mode bit 2 set:   the rounding mode already in env->sse_status is used.
 *
 * The rounding mode of env->sse_status is restored before returning.
 */
void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
{
    signed char prev_rounding_mode;

    prev_rounding_mode = env->sse_status.float_rounding_mode;
    if (!(mode & (1 << 2))) {
        switch (mode & 3) {
        case 0:
            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
            break;
        case 1:
            set_float_rounding_mode(float_round_down, &env->sse_status);
            break;
        case 2:
            set_float_rounding_mode(float_round_up, &env->sse_status);
            break;
        case 3:
            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
            break;
        }
    }

    /* The L lane holds a 32-bit float, so the float32 routine is required. */
    d->L(0) = float32_round_to_int(s->L(0), &env->sse_status);

#if 0 /* TODO */
    if (mode & (1 << 3))
        set_float_exception_flags(
                        get_float_exception_flags(&env->sse_status) &
                        ~float_flag_inexact,
                        &env->sse_status);
#endif
    env->sse_status.float_rounding_mode = prev_rounding_mode;
}
1691

  
1692
/*
 * ROUNDSD: rounds the low double-precision lane of s to an integral value
 * and stores it in d->Q(0); d->Q(1) is left untouched.  When mode bit 2 is
 * clear, mode bits 1:0 select the rounding mode for this operation;
 * otherwise the mode already held in env->sse_status is used.  The
 * previous rounding mode is restored on exit.
 */
void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
{
    /* Rounding mode selected by the two low bits of the immediate. */
    const int imm_rounding[4] = {
        float_round_nearest_even,
        float_round_down,
        float_round_up,
        float_round_to_zero,
    };
    signed char saved_mode = env->sse_status.float_rounding_mode;

    if ((mode & (1 << 2)) == 0)
        set_float_rounding_mode(imm_rounding[mode & 3], &env->sse_status);

    d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);

    /*
     * TODO: when mode bit 3 is set, float_flag_inexact should be removed
     * from the exception flags of env->sse_status (not implemented).
     */
    env->sse_status.float_rounding_mode = saved_mode;
}
1724

  
1725
#define FBLENDP(d, s, m) m ? s : d
1726
SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
1727
SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
1728
SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
1729

  
1730
/*
 * DPPS: single-precision dot product.
 * Mask bits 4..7 choose which of the four lane products d->L(i) * s->L(i)
 * are accumulated (in lane order, starting from zero); mask bits 0..3
 * choose which lanes of d receive the sum, the others being zeroed.
 */
void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
{
    float32 sum = 0 /*float32_zero*/;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        if (mask & (1 << (4 + lane))) {
            float32 product = float32_mul(d->L(lane), s->L(lane),
                                          &env->sse_status);

            sum = float32_add(sum, product, &env->sse_status);
        }
    }

    for (lane = 0; lane < 4; lane++) {
        if (mask & (1 << lane))
            d->L(lane) = sum;
        else
            d->L(lane) = 0 /*float32_zero*/;
    }
}
1755

  
1756
/*
 * DPPD: double-precision dot product.
 * Mask bits 4..5 choose which of the two lane products d->Q(i) * s->Q(i)
 * are accumulated (in lane order, starting from zero); mask bits 0..1
 * choose which lanes of d receive the sum, the others being zeroed.
 */
void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
{
    float64 sum = 0 /*float64_zero*/;
    int lane;

    for (lane = 0; lane < 2; lane++) {
        if (mask & (1 << (4 + lane))) {
            float64 product = float64_mul(d->Q(lane), s->Q(lane),
                                          &env->sse_status);

            sum = float64_add(sum, product, &env->sse_status);
        }
    }

    for (lane = 0; lane < 2; lane++) {
        if (mask & (1 << lane))
            d->Q(lane) = sum;
        else
            d->Q(lane) = 0 /*float64_zero*/;
    }
}
1771

  
1772
/*
 * MPSADBW: eight sums of absolute byte differences.
 * A fixed group of four bytes of s, starting at byte (offset & 3) * 4, is
 * compared against eight overlapping four-byte windows of d; window i
 * starts at byte (offset & 4) + i.  Each 16-bit lane i of the result is
 * the sum of the four absolute differences.  The result is built in a
 * temporary and copied to d once complete.
 */
void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset)
{
    int src_base = (offset & 3) << 2;
    int dst_base = (offset & 4) << 0;
    int win, k;
    Reg r;

    for (win = 0; win < 8; win++) {
        int sum = 0;

        for (k = 0; k < 4; k++)
            sum += abs1(d->B(dst_base + win + k) - s->B(src_base + k));

        r.W(win) = sum;
    }

    *d = r;
}
1789

  
1790
/* SSE4.2 op helpers */
1791
/*
 * PCMPGTQ compares signed quadwords, so both operands are converted to
 * int64_t before the comparison.  Yields -1 (all ones) when d > s, else 0.
 */
#define FCMPGTQ(d, s) (((int64_t) (d) > (int64_t) (s)) ? -1 : 0)
1793
SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
1794

  
1795
/*
 * Explicit string length for PCMPESTRx: the absolute value of guest
 * register "reg", clamped to 8 when ctrl bit 0 is set and to 16 otherwise.
 * The register is read as 64 bits wide when any ctrl bit above bit 7 is
 * set, and as 32 bits wide otherwise.
 */
static inline int pcmp_elen(int reg, uint32_t ctrl)
{
    int limit = (ctrl & 1) ? 8 : 16;
    int len;

    /* Presence of REX.W is indicated by a bit higher than 7 set */
    if (ctrl >> 8)
        len = abs1((int64_t) env->regs[reg]);
    else
        len = abs1((int32_t) env->regs[reg]);

    return len > limit ? limit : len;
}
1814

  
1815
/*
 * Implicit string length for PCMPISTRx: the number of leading non-zero
 * elements of r.  Elements are 16-bit words (at most 8) when ctrl bit 0 is
 * set and bytes (at most 16) otherwise.
 */
static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
{
    int words = ctrl & 1;
    int limit = words ? 8 : 16;
    int len;

    for (len = 0; len < limit; len++) {
        if (words ? !r->W(len) : !r->B(len))
            break;
    }

    return len;
}
1828

  
1829
/*
 * Fetches element i of r for the string comparisons.
 * ctrl bit 0 selects 16-bit words over bytes, ctrl bit 1 selects signed
 * over unsigned interpretation of the element.
 */
static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
{
    int is_word = ctrl & 1;
    int is_signed = ctrl & 2;

    if (is_word) {
        if (is_signed)
            return (int16_t) r->W(i);
        return r->W(i);
    }

    if (is_signed)
        return (int8_t) r->B(i);
    return r->B(i);
}
1843

  
1844
/*
 * Common comparison core of the PCMPxSTRx helpers.
 *
 * d, s    - the two string operands
 * ctrl    - control byte: bit 0 selects word (8) over byte (16) elements,
 *           bits 3:2 select the comparison performed, bits 5:4 select the
 *           optional inversion of the result
 * valids  - number of valid elements in s
 * validd  - number of valid elements in d
 *
 * Returns the result bit mask and sets CC_SRC: CC_Z / CC_S when s / d
 * holds fewer valid elements than the register can hold, CC_C when the
 * mask is non-zero, CC_O when bit 0 of the mask is set.
 */
static inline unsigned pcmpxstrx(Reg *d, Reg *s,
1845
                int8_t ctrl, int valids, int validd)
1846
{
1847
    unsigned int res = 0;
1848
    int v;
1849
    int j, i;
1850
    /* Index of the last element of a register: 7 for words, 15 for bytes. */
    int upper = (ctrl & 1) ? 7 : 15;
1851

  
1852
    /* Turn the element counts into the index of the last valid element
     * (-1 for an empty string). */
    valids--;
1853
    validd--;
1854

  
1855
    CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
1856

  
1857
    switch ((ctrl >> 2) & 3) {
1858
    /* Bit j is set when s[j] equals any valid element of d. */
    case 0:
1859
        for (j = valids; j >= 0; j--) {
1860
            res <<= 1;
1861
            v = pcmp_val(s, ctrl, j);
1862
            for (i = validd; i >= 0; i--)
1863
                res |= (v == pcmp_val(d, ctrl, i));
1864
        }
1865
        break;
1866
    /* d is walked in pairs (i - 1, i); bit j is set when some pair
     * satisfies d[i] <= s[j] && d[i - 1] >= s[j]. */
    case 1:
1867
        for (j = valids; j >= 0; j--) {
1868
            res <<= 1;
1869
            v = pcmp_val(s, ctrl, j);
1870
            for (i = ((validd - 1) | 1); i >= 0; i -= 2)
1871
                res |= (pcmp_val(d, ctrl, i - 0) <= v &&
1872
                        pcmp_val(d, ctrl, i - 1) >= v);
1873
        }
1874
        break;
1875
    /* Element-wise equality: positions past both strings start as ones,
     * positions valid in only one string as zeroes, the rest compare
     * s[i] with d[i]. */
    case 2:
1876
        res = (2 << (upper - MAX(valids, validd))) - 1;
1877
        res <<= MAX(valids, validd) - MIN(valids, validd);
1878
        for (i = MIN(valids, validd); i >= 0; i--) {
1879
            res <<= 1;
1880
            v = pcmp_val(s, ctrl, i);
1881
            res |= (v == pcmp_val(d, ctrl, i));
1882
        }
1883
        break;
1884
    /* For each offset j, compares d[0..] against s[j..]. */
    case 3:
1885
        for (j = valids - validd; j >= 0; j--) {
1886
            res <<= 1;
1887
            res |= 1;
1888
            for (i = MIN(upper - j, validd); i >= 0; i--)
1889
                /* NOTE(review): the right-hand side is 0 or 1, so this AND
                 * also clears every bit of res above bit 0, i.e. the
                 * results of the offsets already processed - confirm
                 * whether that is intended. */
                res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
1890
        }
1891
        break;
1892
    }
1893

  
1894
    /* Optional inversion of the result, selected by ctrl bits 5:4. */
    switch ((ctrl >> 4) & 3) {
1895
    /* Invert all bits 0..upper. */
    case 1:
1896
        res ^= (2 << upper) - 1;
1897
        break;
1898
    /* Invert only bits 0..valids.
     * NOTE(review): valids may be -1 here (empty s), which makes the
     * shift count negative - confirm how that case should behave. */
    case 3:
1899
        res ^= (2 << valids) - 1;
1900
        break;
1901
    }
1902

  
1903
    if (res)
1904
       CC_SRC |= CC_C;
1905
    if (res & 1)
1906
       CC_SRC |= CC_O;
1907

  
1908
    return res;
1909
}
1910

  
1911
/*
 * Returns the 1-based position of the highest set bit of val, found by
 * binary search over halving shift distances.  Returns 1 when val is 0.
 */
static inline int rffs1(unsigned int val)
{
    int pos = 1;
    int step = sizeof(val) * 4;

    while (step != 0) {
        unsigned int upper_part = val >> step;

        /* Something is set above "step": continue the search up there. */
        if (upper_part != 0) {
            val = upper_part;
            pos += step;
        }
        step /= 2;
    }

    return pos;
}
1923

  
1924
/*
 * Returns the 1-based position of the lowest set bit of val (1 for bit 0),
 * or 0 when val is 0, following the POSIX ffs() convention.
 *
 * Binary search: whenever the low "hi" bits are all clear, the lowest set
 * bit lies above them, so they are shifted out and counted.
 */
static inline int ffs1(unsigned int val)
{
    int ret = 1, hi;

    if (!val)
        return 0;

    for (hi = sizeof(val) * 4; hi; hi /= 2)
        if (!(val & ((1u << hi) - 1))) {
            val >>= hi;
            ret += hi;
        }

    return ret;
}
1936

  
1937
/*
 * PCMPESTRI: compares d and s as explicit-length strings (lengths taken
 * from EDX for s and EAX for d) and stores an index in ECX.
 * With a non-zero result mask, ECX is the index of its highest set bit
 * when ctrl bit 6 is set, and the value of ffs1() minus one otherwise.
 * With a zero mask, ECX is 8 for word elements and 16 for byte elements.
 */
void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
{
    int len_s = pcmp_elen(R_EDX, ctrl);
    int len_d = pcmp_elen(R_EAX, ctrl);
    unsigned int res = pcmpxstrx(d, s, ctrl, len_s, len_d);

    if (!res) {
        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
        return;
    }

    if (ctrl & (1 << 6))
        env->regs[R_ECX] = rffs1(res) - 1;
    else
        env->regs[R_ECX] = ffs1(res) - 1;
}
1948

  
1949
/*
 * PCMPESTRM: compares d and s as explicit-length strings (lengths taken
 * from EDX for s and EAX for d) and stores the resulting mask in XMM0,
 * which is where the instruction delivers its result.
 *
 * ctrl bit 6 set:   each result bit is expanded to a full element of ones
 *                   or zeroes (8 words when ctrl bit 0 is set, else
 *                   16 bytes).
 * ctrl bit 6 clear: the bit mask is stored zero-extended in the low
 *                   quadword and the high quadword is cleared.
 */
void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
{
    int i;
    /* Computed before XMM0 is touched: d or s may themselves be XMM0. */
    unsigned int res = pcmpxstrx(d, s, ctrl,
                    pcmp_elen(R_EDX, ctrl),
                    pcmp_elen(R_EAX, ctrl));

    if ((ctrl >> 6) & 1) {
        /* Walk the elements upwards, consuming one result bit each. */
        if (ctrl & 1)
            for (i = 0; i < 8; i++, res >>= 1)
                XMM0.W(i) = (res & 1) ? ~0 : 0;
        else
            for (i = 0; i < 16; i++, res >>= 1)
                XMM0.B(i) = (res & 1) ? ~0 : 0;
    } else {
        XMM0.Q(1) = 0;
        XMM0.Q(0) = res;
    }
}
1968

  
1969
/*
 * PCMPISTRI: compares d and s as implicit-length (zero-terminated) strings
 * and stores an index in ECX.
 * With a non-zero result mask, ECX is the index of its highest set bit
 * when ctrl bit 6 is set, and the value of ffs1() minus one otherwise.
 * With a zero mask, ECX is 8 for word elements and 16 for byte elements.
 */
void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
{
    int len_s = pcmp_ilen(s, ctrl);
    int len_d = pcmp_ilen(d, ctrl);
    unsigned int res = pcmpxstrx(d, s, ctrl, len_s, len_d);

    if (!res) {
        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
        return;
    }

    if (ctrl & (1 << 6))
        env->regs[R_ECX] = rffs1(res) - 1;
    else
        env->regs[R_ECX] = ffs1(res) - 1;
}
1980

  
1981
/*
 * PCMPISTRM: compares d and s as implicit-length (zero-terminated) strings
 * and stores the resulting mask in XMM0, which is where the instruction
 * delivers its result.
 *
 * ctrl bit 6 set:   each result bit is expanded to a full element of ones
 *                   or zeroes (8 words when ctrl bit 0 is set, else
 *                   16 bytes).
 * ctrl bit 6 clear: the bit mask is stored zero-extended in the low
 *                   quadword and the high quadword is cleared.
 */
void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
{
    int i;
    /* Computed before XMM0 is touched: d or s may themselves be XMM0. */
    unsigned int res = pcmpxstrx(d, s, ctrl,
                    pcmp_ilen(s, ctrl),
                    pcmp_ilen(d, ctrl));

    if ((ctrl >> 6) & 1) {
        /* Walk the elements upwards, consuming one result bit each. */
        if (ctrl & 1)
            for (i = 0; i < 8; i++, res >>= 1)
                XMM0.W(i) = (res & 1) ? ~0 : 0;
        else
            for (i = 0; i < 16; i++, res >>= 1)
                XMM0.B(i) = (res & 1) ? ~0 : 0;
    } else {
        XMM0.Q(1) = 0;
        XMM0.Q(0) = res;
    }
}
2000

  
2001
#define CRCPOLY        0x1edc6f41
2002
#define CRCPOLY_BITREV 0x82f63b78
2003
/*
 * CRC32 accumulate step using the bit-reversed polynomial CRCPOLY_BITREV.
 * The low len bits of msg are XORed into crc1, then len shift-and-reduce
 * rounds are run, one per message bit.  Returns the updated CRC.
 */
target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
{
    /* Keeps only the low len bits of the message operand. */
    target_ulong msg_mask = (target_ulong) -1 >> (TARGET_LONG_BITS - len);
    target_ulong crc = (msg & msg_mask) ^ crc1;
    uint32_t round;

    for (round = 0; round < len; round++) {
        if (crc & 1)
            crc = (crc >> 1) ^ CRCPOLY_BITREV;
        else
            crc = crc >> 1;
    }

    return crc;
}
2013

  
2014
#define POPMASK(i)     ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
2015
#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i))
2016
/*
 * POPCNT: counts the set bits of n by repeatedly summing adjacent bit
 * fields of doubling width (POPCOUNT steps 0..5).
 * CC_SRC becomes CC_Z when n is zero and 0 otherwise.
 * type 1 stops after the 16-bit step, type 2 (64-bit targets only) after
 * the 32-bit step; in both cases the low byte of the partial sums is
 * returned.
 */
target_ulong helper_popcnt(target_ulong n, uint32_t type)
{
    if (n)
        CC_SRC = 0;
    else
        CC_SRC = CC_Z;

    n = POPCOUNT(n, 0);
    n = POPCOUNT(n, 1);
    n = POPCOUNT(n, 2);
    n = POPCOUNT(n, 3);
    if (type == 1)
        return n & 0xff;

    n = POPCOUNT(n, 4);
#ifdef TARGET_X86_64
    if (type == 2)
        return n & 0xff;

    n = POPCOUNT(n, 5);
#endif
    return n;
}
2037
#endif
2038

  
1423 2039
#undef SHIFT
1424 2040
#undef XMM_ONLY
1425 2041
#undef Reg
b/target-i386/ops_sse_header.h
1 1
/*
2
 *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
2
 *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3 3
 *
4 4
 *  Copyright (c) 2005 Fabrice Bellard
5 5
 *
......
269 269
DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s))
270 270
DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, int32_t shift))
271 271

  
272
/* SSE4.1 op helpers */
273
#if SHIFT == 1
274
DEF_HELPER(void, glue(helper_pblendvb, SUFFIX), (Reg *d, Reg *s))
275
DEF_HELPER(void, glue(helper_blendvps, SUFFIX), (Reg *d, Reg *s))
276
DEF_HELPER(void, glue(helper_blendvpd, SUFFIX), (Reg *d, Reg *s))
277
DEF_HELPER(void, glue(helper_ptest, SUFFIX), (Reg *d, Reg *s))
278
DEF_HELPER(void, glue(helper_pmovsxbw, SUFFIX), (Reg *d, Reg *s))
279
DEF_HELPER(void, glue(helper_pmovsxbd, SUFFIX), (Reg *d, Reg *s))
280
DEF_HELPER(void, glue(helper_pmovsxbq, SUFFIX), (Reg *d, Reg *s))
281
DEF_HELPER(void, glue(helper_pmovsxwd, SUFFIX), (Reg *d, Reg *s))
282
DEF_HELPER(void, glue(helper_pmovsxwq, SUFFIX), (Reg *d, Reg *s))
283
DEF_HELPER(void, glue(helper_pmovsxdq, SUFFIX), (Reg *d, Reg *s))
284
DEF_HELPER(void, glue(helper_pmovzxbw, SUFFIX), (Reg *d, Reg *s))
285
DEF_HELPER(void, glue(helper_pmovzxbd, SUFFIX), (Reg *d, Reg *s))
286
DEF_HELPER(void, glue(helper_pmovzxbq, SUFFIX), (Reg *d, Reg *s))
287
DEF_HELPER(void, glue(helper_pmovzxwd, SUFFIX), (Reg *d, Reg *s))
288
DEF_HELPER(void, glue(helper_pmovzxwq, SUFFIX), (Reg *d, Reg *s))
289
DEF_HELPER(void, glue(helper_pmovzxdq, SUFFIX), (Reg *d, Reg *s))
290
DEF_HELPER(void, glue(helper_pmuldq, SUFFIX), (Reg *d, Reg *s))
291
DEF_HELPER(void, glue(helper_pcmpeqq, SUFFIX), (Reg *d, Reg *s))
292
DEF_HELPER(void, glue(helper_packusdw, SUFFIX), (Reg *d, Reg *s))
293
DEF_HELPER(void, glue(helper_pminsb, SUFFIX), (Reg *d, Reg *s))
294
DEF_HELPER(void, glue(helper_pminsd, SUFFIX), (Reg *d, Reg *s))
295
DEF_HELPER(void, glue(helper_pminuw, SUFFIX), (Reg *d, Reg *s))
296
DEF_HELPER(void, glue(helper_pminud, SUFFIX), (Reg *d, Reg *s))
297
DEF_HELPER(void, glue(helper_pmaxsb, SUFFIX), (Reg *d, Reg *s))
298
DEF_HELPER(void, glue(helper_pmaxsd, SUFFIX), (Reg *d, Reg *s))
299
DEF_HELPER(void, glue(helper_pmaxuw, SUFFIX), (Reg *d, Reg *s))
300
DEF_HELPER(void, glue(helper_pmaxud, SUFFIX), (Reg *d, Reg *s))
301
DEF_HELPER(void, glue(helper_pmulld, SUFFIX), (Reg *d, Reg *s))
302
DEF_HELPER(void, glue(helper_phminposuw, SUFFIX), (Reg *d, Reg *s))
303
DEF_HELPER(void, glue(helper_roundps, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
304
DEF_HELPER(void, glue(helper_roundpd, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
305
DEF_HELPER(void, glue(helper_roundss, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
306
DEF_HELPER(void, glue(helper_roundsd, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
307
DEF_HELPER(void, glue(helper_blendps, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
308
DEF_HELPER(void, glue(helper_blendpd, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
309
DEF_HELPER(void, glue(helper_pblendw, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
310
DEF_HELPER(void, glue(helper_dpps, SUFFIX), (Reg *d, Reg *s, uint32_t mask))
311
DEF_HELPER(void, glue(helper_dppd, SUFFIX), (Reg *d, Reg *s, uint32_t mask))
312
DEF_HELPER(void, glue(helper_mpsadbw, SUFFIX), (Reg *d, Reg *s, uint32_t off))
313
#endif
314

  
315
/* SSE4.2 op helpers */
316
#if SHIFT == 1
317
DEF_HELPER(void, glue(helper_pcmpgtq, SUFFIX), (Reg *d, Reg *s))
318
DEF_HELPER(void, glue(helper_pcmpestri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
319
DEF_HELPER(void, glue(helper_pcmpestrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
320
DEF_HELPER(void, glue(helper_pcmpistri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
321
DEF_HELPER(void, glue(helper_pcmpistrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
322
DEF_HELPER(target_ulong, helper_crc32,
323
                (uint32_t crc1, target_ulong msg, uint32_t len))
324
DEF_HELPER(target_ulong, helper_popcnt, (target_ulong n, uint32_t type))
325
#endif
326

  
272 327
#undef SHIFT
273 328
#undef Reg
274 329
#undef SUFFIX
b/target-i386/translate.c
2140 2140
    }
2141 2141
}
2142 2142

  
2143
/* generate modrm memory load or store of 'reg'. TMP0 is used if reg !=
2143
/* generate modrm memory load or store of 'reg'. TMP0 is used if reg ==
2144 2144
   OR_TMP0 */
2145 2145
static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store)
2146 2146
{
......
2770 2770
    [0xc2] = SSE_FOP(cmpeq),
2771 2771
    [0xc6] = { helper_shufps, helper_shufpd },
2772 2772

  
2773
    [0x38] = { SSE_SPECIAL, SSE_SPECIAL },  /* SSSE3 */
2774
    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL },  /* SSSE3 */
2773
    [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */
2774
    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */
2775 2775

  
2776 2776
    /* MMX ops and their SSE extensions */
2777 2777
    [0x60] = MMX_OP2(punpcklbw),
......
2924 2924
    [0xbf] = helper_pavgb_mmx /* pavgusb */
2925 2925
};
2926 2926

  
2927
static void *sse_op_table6[256][2] = {
2928
    [0x00] = MMX_OP2(pshufb),
2929
    [0x01] = MMX_OP2(phaddw),
2930
    [0x02] = MMX_OP2(phaddd),
2931
    [0x03] = MMX_OP2(phaddsw),
2932
    [0x04] = MMX_OP2(pmaddubsw),
2933
    [0x05] = MMX_OP2(phsubw),
2934
    [0x06] = MMX_OP2(phsubd),
2935
    [0x07] = MMX_OP2(phsubsw),
2936
    [0x08] = MMX_OP2(psignb),
2937
    [0x09] = MMX_OP2(psignw),
2938
    [0x0a] = MMX_OP2(psignd),
2939
    [0x0b] = MMX_OP2(pmulhrsw),
2940
    [0x1c] = MMX_OP2(pabsb),
2941
    [0x1d] = MMX_OP2(pabsw),
2942
    [0x1e] = MMX_OP2(pabsd),
2927
struct sse_op_helper_s {
2928
    void *op[2]; uint32_t ext_mask;
2929
};
2930
#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 }
2931
#define SSE41_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE41 }
2932
#define SSE42_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
2933
#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 }
2934
static struct sse_op_helper_s sse_op_table6[256] = {
2935
    [0x00] = SSSE3_OP(pshufb),
2936
    [0x01] = SSSE3_OP(phaddw),
2937
    [0x02] = SSSE3_OP(phaddd),
2938
    [0x03] = SSSE3_OP(phaddsw),
2939
    [0x04] = SSSE3_OP(pmaddubsw),
2940
    [0x05] = SSSE3_OP(phsubw),
2941
    [0x06] = SSSE3_OP(phsubd),
2942
    [0x07] = SSSE3_OP(phsubsw),
2943
    [0x08] = SSSE3_OP(psignb),
2944
    [0x09] = SSSE3_OP(psignw),
2945
    [0x0a] = SSSE3_OP(psignd),
2946
    [0x0b] = SSSE3_OP(pmulhrsw),
2947
    [0x10] = SSE41_OP(pblendvb),
2948
    [0x14] = SSE41_OP(blendvps),
2949
    [0x15] = SSE41_OP(blendvpd),
2950
    [0x17] = SSE41_OP(ptest),
2951
    [0x1c] = SSSE3_OP(pabsb),
2952
    [0x1d] = SSSE3_OP(pabsw),
2953
    [0x1e] = SSSE3_OP(pabsd),
2954
    [0x20] = SSE41_OP(pmovsxbw),
2955
    [0x21] = SSE41_OP(pmovsxbd),
2956
    [0x22] = SSE41_OP(pmovsxbq),
2957
    [0x23] = SSE41_OP(pmovsxwd),
2958
    [0x24] = SSE41_OP(pmovsxwq),
2959
    [0x25] = SSE41_OP(pmovsxdq),
2960
    [0x28] = SSE41_OP(pmuldq),
2961
    [0x29] = SSE41_OP(pcmpeqq),
2962
    [0x2a] = SSE41_SPECIAL, /* movntqda */
2963
    [0x2b] = SSE41_OP(packusdw),
2964
    [0x30] = SSE41_OP(pmovzxbw),
2965
    [0x31] = SSE41_OP(pmovzxbd),
2966
    [0x32] = SSE41_OP(pmovzxbq),
2967
    [0x33] = SSE41_OP(pmovzxwd),
2968
    [0x34] = SSE41_OP(pmovzxwq),
2969
    [0x35] = SSE41_OP(pmovzxdq),
2970
    [0x37] = SSE42_OP(pcmpgtq),
2971
    [0x38] = SSE41_OP(pminsb),
2972
    [0x39] = SSE41_OP(pminsd),
2973
    [0x3a] = SSE41_OP(pminuw),
2974
    [0x3b] = SSE41_OP(pminud),
2975
    [0x3c] = SSE41_OP(pmaxsb),
2976
    [0x3d] = SSE41_OP(pmaxsd),
2977
    [0x3e] = SSE41_OP(pmaxuw),
2978
    [0x3f] = SSE41_OP(pmaxud),
2979
    [0x40] = SSE41_OP(pmulld),
2980
    [0x41] = SSE41_OP(phminposuw),
2943 2981
};
2944 2982

  
2945
static void *sse_op_table7[256][2] = {
2946
    [0x0f] = MMX_OP2(palignr),
2983
static struct sse_op_helper_s sse_op_table7[256] = {
2984
    [0x08] = SSE41_OP(roundps),
2985
    [0x09] = SSE41_OP(roundpd),
2986
    [0x0a] = SSE41_OP(roundss),
2987
    [0x0b] = SSE41_OP(roundsd),
2988
    [0x0c] = SSE41_OP(blendps),
2989
    [0x0d] = SSE41_OP(blendpd),
2990
    [0x0e] = SSE41_OP(pblendw),
2991
    [0x0f] = SSSE3_OP(palignr),
2992
    [0x14] = SSE41_SPECIAL, /* pextrb */
2993
    [0x15] = SSE41_SPECIAL, /* pextrw */
2994
    [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */
2995
    [0x17] = SSE41_SPECIAL, /* extractps */
2996
    [0x20] = SSE41_SPECIAL, /* pinsrb */
2997
    [0x21] = SSE41_SPECIAL, /* insertps */
2998
    [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */
2999
    [0x40] = SSE41_OP(dpps),
3000
    [0x41] = SSE41_OP(dppd),
3001
    [0x42] = SSE41_OP(mpsadbw),
3002
    [0x60] = SSE42_OP(pcmpestrm),
3003
    [0x61] = SSE42_OP(pcmpestri),
3004
    [0x62] = SSE42_OP(pcmpistrm),
3005
    [0x63] = SSE42_OP(pcmpistri),
2947 3006
};
2948 3007

  
2949 3008
static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
......
3511 3570
            break;
3512 3571
        case 0x038:
3513 3572
        case 0x138:
3514
            if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3))
3515
                goto illegal_op;
3516

  
3517 3573
            b = modrm;
3518 3574
            modrm = ldub_code(s->pc++);
3519 3575
            rm = modrm & 7;
3520 3576
            reg = ((modrm >> 3) & 7) | rex_r;
3521 3577
            mod = (modrm >> 6) & 3;
3522 3578

  
3523
            sse_op2 = sse_op_table6[b][b1];
3579
            if (s->prefix & PREFIX_REPNZ)
3580
                goto crc32;
3581

  
3582
            sse_op2 = sse_op_table6[b].op[b1];
3524 3583
            if (!sse_op2)
3525 3584
                goto illegal_op;
3585
            if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask))
3586
                goto illegal_op;
3526 3587

  
3527 3588
            if (b1) {
3528 3589
                op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
......
3531 3592
                } else {
3532 3593
                    op2_offset = offsetof(CPUX86State,xmm_t0);
3533 3594
                    gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
3534
                    gen_ldo_env_A0(s->mem_index, op2_offset);
3595
                    /*
                     * Memory source operand: load only the part of the
                     * operand the instruction consumes, into the low element
                     * of the xmm_t0 scratch register (op2_offset).  The
                     * pmovsx/pmovzx forms read 64, 32 or 16 bits depending on
                     * the widening ratio; everything else takes the full
                     * operand via gen_ldo_env_A0().
                     */
                    switch (b) {
3596
                    case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */
3597
                    case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */
3598
                    case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */
3599
                        gen_ldq_env_A0(s->mem_index, op2_offset +
3600
                                        offsetof(XMMReg, XMM_Q(0)));
3601
                        break;
3602
                    case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */
3603
                    case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */
3604
                        tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0,
3605
                                          (s->mem_index >> 2) - 1);
3606
                        tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset +
3607
                                        offsetof(XMMReg, XMM_L(0)));
3608
                        break;
3609
                    case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */
3610
                        tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0,
3611
                                          (s->mem_index >> 2) - 1);
3612
                        tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset +
3613
                                        offsetof(XMMReg, XMM_W(0)));
3614
                        break;
3615
                    case 0x2a:            /* movntdqa */
3616
                        /* Pure load: write straight into the destination
                           register and finish, no helper is called. */
                        gen_ldo_env_A0(s->mem_index, op1_offset);
3617
                        return;
3618
                    default:
3619
                        gen_ldo_env_A0(s->mem_index, op2_offset);
3620
                    }
3535 3621
                }
3536 3622
            } else {
3537 3623
                op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
......
3543 3629
                    gen_ldq_env_A0(s->mem_index, op2_offset);
3544 3630
                }
3545 3631
            }
3632
            if (sse_op2 == SSE_SPECIAL)
3633
                goto illegal_op;
3634

  
3546 3635
            tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
3547 3636
            tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
3548 3637
            tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1);
3638

  
3639
            if (b == 0x17)
3640
                s->cc_op = CC_OP_EFLAGS;
3549 3641
            break;
3550
        case 0x03a:
3551
        case 0x13a:
3552
            if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3))
3642
        case 0x338: /* crc32 */
3643
        crc32:
3644
            b = modrm;
3645
            modrm = ldub_code(s->pc++);
3646
            reg = ((modrm >> 3) & 7) | rex_r;
3647

  
3648
            if (b != 0xf0 && b != 0xf1)
3649
                goto illegal_op;
3650
            if (!(s->cpuid_ext_features & CPUID_EXT_SSE42))
3553 3651
                goto illegal_op;
3554 3652

  
3653
            if (b == 0xf0)
3654
                ot = OT_BYTE;
3655
            else if (b == 0xf1 && s->dflag != 2)
3656
                if (s->prefix & PREFIX_DATA)
3657
                    ot = OT_WORD;
3658
                else
3659
                    ot = OT_LONG;
3660
            else
3661
                ot = OT_QUAD;
3662

  
3663
            gen_op_mov_TN_reg(OT_LONG, 0, reg);
3664
            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
3665
            gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
3666
            tcg_gen_helper_1_3(helper_crc32, cpu_T[0], cpu_tmp2_i32,
3667
                            cpu_T[0], tcg_const_i32(8 << ot));
3668

  
3669
            ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
3670
            gen_op_mov_reg_T0(ot, reg);
3671
            break;
3672
        case 0x03a:
3673
        case 0x13a:
3555 3674
            b = modrm;
3556 3675
            modrm = ldub_code(s->pc++);
3557 3676
            rm = modrm & 7;
3558 3677
            reg = ((modrm >> 3) & 7) | rex_r;
3559 3678
            mod = (modrm >> 6) & 3;
3560 3679

  
3561
            sse_op2 = sse_op_table7[b][b1];
3680
            sse_op2 = sse_op_table7[b].op[b1];
3562 3681
            if (!sse_op2)
3563 3682
                goto illegal_op;
3683
            if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask))
3684
                goto illegal_op;
3685

  
3686
            /*
             * Immediate-form extract/insert instructions are generated inline
             * instead of through a helper.  `reg` is the XMM register, `rm`
             * is the general-purpose register (mod == 3) or the memory
             * operand addressed through cpu_A0, and `val` is the imm8 that
             * selects the element.  `ot` is the general-purpose operand size
             * used for register write-back and for choosing between the
             * d- and q-sized forms of 0x16/0x22.
             * NOTE(review): the switch has no default; it relies on every
             * SSE_SPECIAL entry of sse_op_table7 having a case below.
             */
            if (sse_op2 == SSE_SPECIAL) {
3687
                ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
3688
                rm = (modrm & 7) | REX_B(s);
3689
                if (mod != 3)
3690
                    gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
3691
                reg = ((modrm >> 3) & 7) | rex_r;
3692
                val = ldub_code(s->pc++);
3693
                switch (b) {
3694
                case 0x14: /* pextrb */
3695
                    tcg_gen_ld8u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
3696
                                            xmm_regs[reg].XMM_B(val & 15)));
3697
                    if (mod == 3)
3698
                        gen_op_mov_reg_T0(ot, rm);
3699
                    else
3700
                        tcg_gen_qemu_st8(cpu_T[0], cpu_A0,
3701
                                        (s->mem_index >> 2) - 1);
3702
                    break;
3703
                case 0x15: /* pextrw */
3704
                    tcg_gen_ld16u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
3705
                                            xmm_regs[reg].XMM_W(val & 7)));
3706
                    if (mod == 3)
3707
                        gen_op_mov_reg_T0(ot, rm);
3708
                    else
3709
                        tcg_gen_qemu_st16(cpu_T[0], cpu_A0,
3710
                                        (s->mem_index >> 2) - 1);
3711
                    break;
3712
                case 0x16:
3713
                    if (ot == OT_LONG) { /* pextrd */
3714
                        tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env,
3715
                                        offsetof(CPUX86State,
3716
                                                xmm_regs[reg].XMM_L(val & 3)));
3717
                        if (mod == 3)
3718
                            gen_op_mov_reg_v(ot, rm, cpu_tmp2_i32);
3719
                        else
3720
                            tcg_gen_qemu_st32(cpu_tmp2_i32, cpu_A0,
3721
                                            (s->mem_index >> 2) - 1);
3722
                    } else { /* pextrq */
3723
                        tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env,
3724
                                        offsetof(CPUX86State,
3725
                                                xmm_regs[reg].XMM_Q(val & 1)));
3726
                        if (mod == 3)
3727
                            gen_op_mov_reg_v(ot, rm, cpu_tmp1_i64);
3728
                        else
3729
                            tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0,
3730
                                            (s->mem_index >> 2) - 1);
3731
                    }
3732
                    break;
3733
                case 0x17: /* extractps */
3734
                    tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
3735
                                            xmm_regs[reg].XMM_L(val & 3)));
3736
                    if (mod == 3)
3737
                        gen_op_mov_reg_T0(ot, rm);
3738
                    else
3739
                        tcg_gen_qemu_st32(cpu_T[0], cpu_A0,
3740
                                        (s->mem_index >> 2) - 1);
3741
                    break;
3742
                case 0x20: /* pinsrb */
3743
                    if (mod == 3)
3744
                        gen_op_mov_TN_reg(OT_LONG, 0, rm);
3745
                    else
3746
                        tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0,
3747
                                        (s->mem_index >> 2) - 1);
3748
                    tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
3749
                                            xmm_regs[reg].XMM_B(val & 15)));
3750
                    break;
3751
                case 0x21: /* insertps */
3752
                    /* imm8 bits 7:6 pick the source dword (register form
                       only; here rm indexes an XMM register), bits 5:4 pick
                       the destination dword, bits 3:0 are a zeroing mask. */
                    if (mod == 3)
3753
                        tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env,
3754
                                        offsetof(CPUX86State,xmm_regs[rm]
3755
                                                .XMM_L((val >> 6) & 3)));
3756
                    else
3757
                        tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0,
3758
                                        (s->mem_index >> 2) - 1);
3759
                    tcg_gen_st_i32(cpu_tmp2_i32, cpu_env,
3760
                                    offsetof(CPUX86State,xmm_regs[reg]
3761
                                            .XMM_L((val >> 4) & 3)));
3762
                    /* Zero each destination dword whose mask bit is set;
                       this runs after the insert, so the mask wins. */
                    if ((val >> 0) & 1)
3763
                        tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
3764
                                        cpu_env, offsetof(CPUX86State,
3765
                                                xmm_regs[reg].XMM_L(0)));
3766
                    if ((val >> 1) & 1)
3767
                        tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
3768
                                        cpu_env, offsetof(CPUX86State,
3769
                                                xmm_regs[reg].XMM_L(1)));
3770
                    if ((val >> 2) & 1)
3771
                        tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
3772
                                        cpu_env, offsetof(CPUX86State,
3773
                                                xmm_regs[reg].XMM_L(2)));
3774
                    if ((val >> 3) & 1)
3775
                        tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
3776
                                        cpu_env, offsetof(CPUX86State,
3777
                                                xmm_regs[reg].XMM_L(3)));
3778
                    break;
3779
                case 0x22:
3780
                    if (ot == OT_LONG) { /* pinsrd */
3781
                        if (mod == 3)
3782
                            gen_op_mov_v_reg(ot, cpu_tmp2_i32, rm);
3783
                        else
3784
                            tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0,
3785
                                            (s->mem_index >> 2) - 1);
3786
                        tcg_gen_st_i32(cpu_tmp2_i32, cpu_env,
3787
                                        offsetof(CPUX86State,
3788
                                                xmm_regs[reg].XMM_L(val & 3)));
3789
                    } else { /* pinsrq */
3790
                        if (mod == 3)
3791
                            gen_op_mov_v_reg(ot, cpu_tmp1_i64, rm);
3792
                        else
3793
                            tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0,
3794
                                            (s->mem_index >> 2) - 1);
3795
                        tcg_gen_st_i64(cpu_tmp1_i64, cpu_env,
3796
                                        offsetof(CPUX86State,
3797
                                                xmm_regs[reg].XMM_Q(val & 1)));
3798
                    }
3799
                    break;
3800
                }
3801
                /* Everything was emitted inline; skip the helper call. */
                return;
3802
            }
3564 3803

  
3565 3804
            if (b1) {
3566 3805
                op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
......
3583 3822
            }
3584 3823
            val = ldub_code(s->pc++);
3585 3824

  
3825
            /* Opcodes 0x60..0x63: pcmpestrm/pcmpestri/pcmpistrm/pcmpistri.
               The translator switches to CC_OP_EFLAGS for these, and bit 8 of
               the value passed to the helper (above the real imm8) flags a
               64-bit operand size. */
            if ((b & 0xfc) == 0x60) { /* pcmpXstrX */
3826
                s->cc_op = CC_OP_EFLAGS;
3827

  
3828
                if (s->dflag == 2)
3829
                    /* The helper must use entire 64-bit gp registers */
3830
                    val |= 1 << 8;
3831
            }
3832

  
3586 3833
            tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
3587 3834
            tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
3588 3835
            tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
......
7094 7341
            gen_eob(s);
7095 7342
        }
7096 7343
        break;
7097
    /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3 support */
7344
    /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */
7098 7345
    case 0x1c3: /* MOVNTI reg, mem */
7099 7346
        if (!(s->cpuid_features & CPUID_SSE2))
7100 7347
            goto illegal_op;
......
7202 7449
        tcg_gen_helper_0_0(helper_rsm);
7203 7450
        gen_eob(s);
7204 7451
        break;
7452
    case 0x1b8: /* SSE4.2 popcnt */
7453
        if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) !=
7454
             PREFIX_REPZ)
7455
            goto illegal_op;
7456
        if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT))
7457
            goto illegal_op;
7458

  
7459
        modrm = ldub_code(s->pc++);
7460
        reg = ((modrm >> 3) & 7);
7461

  
7462
        if (s->prefix & PREFIX_DATA)
7463
            ot = OT_WORD;
7464
        else if (s->dflag != 2)
7465
            ot = OT_LONG;
7466
        else
7467
            ot = OT_QUAD;
7468

  
7469
        gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
7470
        tcg_gen_helper_1_2(helper_popcnt,
7471
                cpu_T[0], cpu_T[0], tcg_const_i32(ot));
7472
        gen_op_mov_reg_T0(ot, reg);
7473
        break;
7205 7474
    case 0x10e ... 0x10f:
7206 7475
        /* 3DNow! instructions, ignore prefixes */
7207 7476
        s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA);
b/tests/test-i386-ssse3.c
1 1
/* See if various MMX/SSE SSSE3 instructions give expected results */
2 2
#include <stdio.h>
3 3
#include <string.h>
4
#include <stdint.h>
4 5

  
5 6
int main(int argc, char *argv[]) {
6 7
	char hello[16];
......
9 10

  
10 11
	uint64_t a = 0x0000000000090007;
11 12
	uint64_t b = 0x0000000000000000;
13
	uint32_t c;
14
	uint16_t d;
12 15

  
13
	const char c[16] = "LLOaaaaaaaaaaaaa";
14
	const char d[16] = "aaaaaaaaaaaaaaHE";
16
	const char e[16] = "LLOaaaaaaaaaaaaa";
17
	const char f[16] = "aaaaaaaaaaaaaaHE";
15 18

  
16 19
	/* pshufb mm1/xmm1, mm2/xmm2 */
17 20
	asm volatile ("movq    (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1");
......
33 36
	printf("%i - %i = %i\n", 9, 7, -(int16_t) a);
34 37

  
35 38
	/* palignr mm1/xmm1, m64/m128, imm8 */
36
	asm volatile ("movdqa  (%0), %%xmm0" : : "r" (c) : "xmm0");
37
	asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (d));
39
	asm volatile ("movdqa  (%0), %%xmm0" : : "r" (e) : "xmm0");
40
	asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (f));
38 41
	asm volatile ("movdqa  %%xmm0, (%0)" : : "r" (hello));
39 42
	printf("%5.5s\n", hello);
40 43

  
44
#if 1 /* SSE4 */
45
	/* popcnt r64, r/m64 */
46
	asm volatile ("movq    $0x8421000010009c63, %%rax" : : : "rax");
47
	asm volatile ("popcnt  %%ax, %%dx" : : : "dx");
48
	asm volatile ("popcnt  %%eax, %%ecx" : : : "ecx");
49
	asm volatile ("popcnt  %rax, %rax");
50
	asm volatile ("movq    %%rax, %0" : "=m" (a));
51
	asm volatile ("movl    %%ecx, %0" : "=m" (c));
52
	asm volatile ("movw    %%dx, %0" : "=m" (d));
53
	printf("%i = %i\n%i = %i = %i\n", 13, (int) a, 9, c, d + 1);
54
#endif
55

  
41 56
	return 0;
42 57
}

Also available in: Unified diff