target-i386 / kvm.c @ a8170e5e
/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/utsname.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>

#include "qemu-common.h"
#include "sysemu.h"
#include "kvm.h"
#include "kvm_i386.h"
#include "cpu.h"
#include "gdbstub.h"
#include "host-utils.h"
#include "hw/pc.h"
#include "hw/apic.h"
#include "ioport.h"
#include "hyperv.h"
#include "hw/pci.h"

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5
#endif

const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_LAST_INFO
};

static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_deadline;
static bool has_msr_async_pf_en;
static bool has_msr_pv_eoi_en;
static bool has_msr_misc_enable;
static int lm_capable_kernel;

bool kvm_allows_irq0_override(void)
{
    return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
}

static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = (struct kvm_cpuid2 *)g_malloc0(size);
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            g_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

struct kvm_para_features {
    int cap;
    int feature;
} para_features[] = {
    { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
    { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
    { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
    { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
    { -1, -1 }
};

static int get_para_features(KVMState *s)
{
    int i, features = 0;

    for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
        if (kvm_check_extension(s, para_features[i].cap)) {
            features |= (1 << para_features[i].feature);
        }
    }

    return features;
}


uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    int i, max;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx;
    int has_kvm_features = 0;

    max = 1;
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }

    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            if (cpuid->entries[i].function == KVM_CPUID_FEATURES) {
                has_kvm_features = 1;
            }
            switch (reg) {
            case R_EAX:
                ret = cpuid->entries[i].eax;
                break;
            case R_EBX:
                ret = cpuid->entries[i].ebx;
                break;
            case R_ECX:
                ret = cpuid->entries[i].ecx;
                break;
            case R_EDX:
                ret = cpuid->entries[i].edx;
                switch (function) {
                case 1:
                    /* KVM before 2.6.30 misreports the following features */
                    ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
                    break;
                case 0x80000001:
                    /* On Intel, kvm returns cpuid according to the Intel spec,
                     * so add missing bits according to the AMD spec:
                     */
                    cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
                    ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
                    break;
                }
                break;
            }
        }
    }

    g_free(cpuid);

    /* fallback for older kernels */
    if (!has_kvm_features && (function == KVM_CPUID_FEATURES)) {
        ret = get_para_features(s);
    }

    return ret;
}

typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
        g_free(page);
    }
}

static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
    page = g_malloc(sizeof(HWPoisonPage));
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}

static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
                                     int *max_banks)
{
    int r;

    r = kvm_check_extension(s, KVM_CAP_MCE);
    if (r > 0) {
        *max_banks = r;
        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
    }
    return -ENOSYS;
}

static void kvm_mce_inject(CPUX86State *env, hwaddr paddr, int code)
{
    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
    uint64_t mcg_status = MCG_STATUS_MCIP;

    if (code == BUS_MCEERR_AR) {
        status |= MCI_STATUS_AR | 0x134;
        mcg_status |= MCG_STATUS_EIPV;
    } else {
        status |= 0xc0;
        mcg_status |= MCG_STATUS_RIPV;
    }
    cpu_x86_inject_mce(NULL, env, 9, status, mcg_status, paddr,
                       (MCM_ADDR_PHYS << 6) | 0xc,
                       cpu_x86_support_mca_broadcast(env) ?
                       MCE_INJECT_BROADCAST : 0);
}

static void hardware_memory_error(void)
{
    fprintf(stderr, "Hardware memory error!\n");
    exit(1);
}

int kvm_arch_on_sigbus_vcpu(CPUX86State *env, int code, void *addr)
{
    ram_addr_t ram_addr;
    hwaddr paddr;

    if ((env->mcg_cap & MCG_SER_P) && addr
        && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) {
        if (qemu_ram_addr_from_host(addr, &ram_addr) ||
            !kvm_physical_memory_addr_from_host(env->kvm_state, addr, &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!\n");
            /* Hope we are lucky for AO MCE */
            if (code == BUS_MCEERR_AO) {
                return 0;
            } else {
                hardware_memory_error();
            }
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(env, paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

int kvm_arch_on_sigbus(int code, void *addr)
{
    if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
        ram_addr_t ram_addr;
        hwaddr paddr;

        /* Hope we are lucky for AO MCE */
        if (qemu_ram_addr_from_host(addr, &ram_addr) ||
            !kvm_physical_memory_addr_from_host(first_cpu->kvm_state, addr,
                                                &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!: %p\n", addr);
            return 0;
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(first_cpu, paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

static int kvm_inject_mce_oldstyle(CPUX86State *env)
{
    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
        unsigned int bank, bank_num = env->mcg_cap & 0xff;
        struct kvm_x86_mce mce;

        env->exception_injected = -1;

        /*
         * There must be at least one bank in use if an MCE is pending.
         * Find it and use its values for the event injection.
         */
        for (bank = 0; bank < bank_num; bank++) {
            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
                break;
            }
        }
        assert(bank < bank_num);

        mce.bank = bank;
        mce.status = env->mce_banks[bank * 4 + 1];
        mce.mcg_status = env->mcg_status;
        mce.addr = env->mce_banks[bank * 4 + 2];
        mce.misc = env->mce_banks[bank * 4 + 3];

        return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, &mce);
    }
    return 0;
}

static void cpu_update_state(void *opaque, int running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

int kvm_arch_init_vcpu(CPUX86State *env)
{
    struct {
        struct kvm_cpuid2 cpuid;
        struct kvm_cpuid_entry2 entries[100];
    } QEMU_PACKED cpuid_data;
    KVMState *s = env->kvm_state;
    uint32_t limit, i, j, cpuid_i;
    uint32_t unused;
    struct kvm_cpuid_entry2 *c;
    uint32_t signature[3];
    int r;

    env->cpuid_features &= kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);

    i = env->cpuid_ext_features & CPUID_EXT_HYPERVISOR;
    j = env->cpuid_ext_features & CPUID_EXT_TSC_DEADLINE_TIMER;
    env->cpuid_ext_features &= kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX);
    env->cpuid_ext_features |= i;
    if (j && kvm_irqchip_in_kernel() &&
        kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
        env->cpuid_ext_features |= CPUID_EXT_TSC_DEADLINE_TIMER;
    }

    env->cpuid_ext2_features &= kvm_arch_get_supported_cpuid(s, 0x80000001,
                                                             0, R_EDX);
    env->cpuid_ext3_features &= kvm_arch_get_supported_cpuid(s, 0x80000001,
                                                             0, R_ECX);
    env->cpuid_svm_features  &= kvm_arch_get_supported_cpuid(s, 0x8000000A,
                                                             0, R_EDX);

    cpuid_i = 0;

    /* Paravirtualization CPUIDs */
    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_SIGNATURE;
    if (!hyperv_enabled()) {
        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
        c->eax = 0;
    } else {
        memcpy(signature, "Microsoft Hv", 12);
        c->eax = HYPERV_CPUID_MIN;
    }
    c->ebx = signature[0];
    c->ecx = signature[1];
    c->edx = signature[2];

    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_FEATURES;
    c->eax = env->cpuid_kvm_features &
        kvm_arch_get_supported_cpuid(s, KVM_CPUID_FEATURES, 0, R_EAX);

    if (hyperv_enabled()) {
        memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
        c->eax = signature[0];

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_VERSION;
        c->eax = 0x00001bbc;
        c->ebx = 0x00060001;

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_FEATURES;
        if (hyperv_relaxed_timing_enabled()) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
        }
        if (hyperv_vapic_recommended()) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
            c->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
        }

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_ENLIGHTMENT_INFO;
        if (hyperv_relaxed_timing_enabled()) {
            c->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
        }
        if (hyperv_vapic_recommended()) {
            c->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
        }
        c->ebx = hyperv_get_spinlock_retries();

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_IMPLEMENT_LIMITS;
        c->eax = 0x40;
        c->ebx = 0x40;

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = KVM_CPUID_SIGNATURE_NEXT;
        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
        c->eax = 0;
        c->ebx = signature[0];
        c->ecx = signature[1];
        c->edx = signature[2];
    }

    has_msr_async_pf_en = c->eax & (1 << KVM_FEATURE_ASYNC_PF);

    has_msr_pv_eoi_en = c->eax & (1 << KVM_FEATURE_PV_EOI);

    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);

    for (i = 0; i <= limit; i++) {
        c = &cpuid_data.entries[cpuid_i++];

        switch (i) {
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                       KVM_CPUID_FLAG_STATE_READ_NEXT;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax & 0xff;

            for (j = 1; j < times; ++j) {
                c = &cpuid_data.entries[cpuid_i++];
                c->function = i;
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        case 4:
        case 0xb:
        case 0xd:
            for (j = 0; ; j++) {
                if (i == 0xd && j == 64) {
                    break;
                }
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (i == 4 && c->eax == 0) {
                    break;
                }
                if (i == 0xb && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0xd && c->eax == 0) {
                    continue;
                }
                c = &cpuid_data.entries[cpuid_i++];
            }
            break;
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            break;
        }
    }
    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);

    for (i = 0x80000000; i <= limit; i++) {
        c = &cpuid_data.entries[cpuid_i++];

        c->function = i;
        c->flags = 0;
        cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
    }

    /* Call Centaur's CPUID instructions if they are supported. */
    if (env->cpuid_xlevel2 > 0) {
        env->cpuid_ext4_features &=
            kvm_arch_get_supported_cpuid(s, 0xC0000001, 0, R_EDX);
        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);

        for (i = 0xC0000000; i <= limit; i++) {
            c = &cpuid_data.entries[cpuid_i++];

            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
        }
    }

    cpuid_data.cpuid.nent = cpuid_i;

    if (((env->cpuid_version >> 8)&0xF) >= 6
        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
        uint64_t mcg_cap;
        int banks;
        int ret;

        ret = kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks);
        if (ret < 0) {
            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
            return ret;
        }

        if (banks > MCE_BANKS_DEF) {
            banks = MCE_BANKS_DEF;
        }
        mcg_cap &= MCE_CAP_DEF;
        mcg_cap |= banks;
        ret = kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, &mcg_cap);
        if (ret < 0) {
            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
            return ret;
        }

        env->mcg_cap = mcg_cap;
    }

    qemu_add_vm_change_state_handler(cpu_update_state, env);

    cpuid_data.cpuid.padding = 0;
    r = kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
    if (r) {
        return r;
    }

    r = kvm_check_extension(env->kvm_state, KVM_CAP_TSC_CONTROL);
    if (r && env->tsc_khz) {
        r = kvm_vcpu_ioctl(env, KVM_SET_TSC_KHZ, env->tsc_khz);
        if (r < 0) {
            fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
            return r;
        }
    }

    if (kvm_has_xsave()) {
        env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
    }

    return 0;
}

void kvm_arch_reset_vcpu(CPUX86State *env)
{
    X86CPU *cpu = x86_env_get_cpu(env);

    env->exception_injected = -1;
    env->interrupt_injected = -1;
    env->xcr0 = 1;
    if (kvm_irqchip_in_kernel()) {
        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
                                          KVM_MP_STATE_UNINITIALIZED;
    } else {
        env->mp_state = KVM_MP_STATE_RUNNABLE;
    }
}

static int kvm_get_supported_msrs(KVMState *s)
{
    static int kvm_supported_msrs;
    int ret = 0;

    /* first time */
    if (kvm_supported_msrs == 0) {
        struct kvm_msr_list msr_list, *kvm_msr_list;

        kvm_supported_msrs = -1;

        /* Obtain MSR list from KVM.  These are the MSRs that we must
         * save/restore */
        msr_list.nmsrs = 0;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
        if (ret < 0 && ret != -E2BIG) {
            return ret;
        }
        /* Old kernel modules had a bug and could write beyond the provided
           memory. Allocate at least a safe amount of 1K. */
        kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
                                              msr_list.nmsrs *
                                              sizeof(msr_list.indices[0])));

        kvm_msr_list->nmsrs = msr_list.nmsrs;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
        if (ret >= 0) {
            int i;

            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
                if (kvm_msr_list->indices[i] == MSR_STAR) {
                    has_msr_star = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_VM_HSAVE_PA) {
                    has_msr_hsave_pa = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_TSCDEADLINE) {
                    has_msr_tsc_deadline = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_MISC_ENABLE) {
                    has_msr_misc_enable = true;
                    continue;
                }
            }
        }

        g_free(kvm_msr_list);
    }

    return ret;
}

int kvm_arch_init(KVMState *s)
{
    QemuOptsList *list = qemu_find_opts("machine");
    uint64_t identity_base = 0xfffbc000;
    uint64_t shadow_mem;
    int ret;
    struct utsname utsname;

    ret = kvm_get_supported_msrs(s);
    if (ret < 0) {
        return ret;
    }

    uname(&utsname);
    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

    /*
     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
     * In order to use vm86 mode, an EPT identity map and a TSS are needed.
     * Since these must be part of guest physical memory, we need to allocate
     * them, both by setting their start addresses in the kernel and by
     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
     *
     * Older KVM versions may not support setting the identity map base. In
     * that case we need to stick with the default, i.e. a 256K maximum BIOS
     * size.
     */
    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
        /* Allows up to 16M BIOSes. */
        identity_base = 0xfeffc000;

        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
        if (ret < 0) {
            return ret;
        }
    }

    /* Set TSS base one page after EPT identity map. */
    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
    if (ret < 0) {
        return ret;
    }

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }
    qemu_register_reset(kvm_unpoison_all, NULL);

    if (!QTAILQ_EMPTY(&list->head)) {
        shadow_mem = qemu_opt_get_size(QTAILQ_FIRST(&list->head),
                                       "kvm_shadow_mem", -1);
        if (shadow_mem != -1) {
            shadow_mem /= 4096;
            ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
    lhs->padding = 0;
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
                 (rhs->present * DESC_P_MASK) |
                 (rhs->dpl << DESC_DPL_SHIFT) |
                 (rhs->db << DESC_B_SHIFT) |
                 (rhs->s * DESC_S_MASK) |
                 (rhs->l << DESC_L_SHIFT) |
                 (rhs->g * DESC_G_MASK) |
                 (rhs->avl * DESC_AVL_MASK);
}

static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
{
    if (set) {
        *kvm_reg = *qemu_reg;
    } else {
        *qemu_reg = *kvm_reg;
    }
}

static int kvm_getput_regs(CPUX86State *env, int set)
{
    struct kvm_regs regs;
    int ret = 0;

    if (!set) {
        ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs);
        if (ret < 0) {
            return ret;
        }
    }

    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
#ifdef TARGET_X86_64
    kvm_getput_reg(&regs.r8, &env->regs[8], set);
    kvm_getput_reg(&regs.r9, &env->regs[9], set);
    kvm_getput_reg(&regs.r10, &env->regs[10], set);
    kvm_getput_reg(&regs.r11, &env->regs[11], set);
    kvm_getput_reg(&regs.r12, &env->regs[12], set);
    kvm_getput_reg(&regs.r13, &env->regs[13], set);
    kvm_getput_reg(&regs.r14, &env->regs[14], set);
    kvm_getput_reg(&regs.r15, &env->regs[15], set);
#endif

    kvm_getput_reg(&regs.rflags, &env->eflags, set);
    kvm_getput_reg(&regs.rip, &env->eip, set);

    if (set) {
        ret = kvm_vcpu_ioctl(env, KVM_SET_REGS, &regs);
    }

    return ret;
}

static int kvm_put_fpu(CPUX86State *env)
{
    struct kvm_fpu fpu;
    int i;

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    fpu.last_opcode = env->fpop;
    fpu.last_ip = env->fpip;
    fpu.last_dp = env->fpdp;
    for (i = 0; i < 8; ++i) {
        fpu.ftwx |= (!env->fptags[i]) << i;
    }
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;

    return kvm_vcpu_ioctl(env, KVM_SET_FPU, &fpu);
}

#define XSAVE_FCW_FSW     0
#define XSAVE_FTW_FOP     1
#define XSAVE_CWD_RIP     2
#define XSAVE_CWD_RDP     4
#define XSAVE_MXCSR       6
#define XSAVE_ST_SPACE    8
#define XSAVE_XMM_SPACE   40
#define XSAVE_XSTATE_BV   128
#define XSAVE_YMMH_SPACE  144

static int kvm_put_xsave(CPUX86State *env)
{
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
    uint16_t cwd, swd, twd;
    int i, r;

    if (!kvm_has_xsave()) {
        return kvm_put_fpu(env);
    }

    memset(xsave, 0, sizeof(struct kvm_xsave));
    twd = 0;
    swd = env->fpus & ~(7 << 11);
    swd |= (env->fpstt & 7) << 11;
    cwd = env->fpuc;
    for (i = 0; i < 8; ++i) {
        twd |= (!env->fptags[i]) << i;
    }
    xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
    xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
    memcpy(&xsave->region[XSAVE_CWD_RIP], &env->fpip, sizeof(env->fpip));
    memcpy(&xsave->region[XSAVE_CWD_RDP], &env->fpdp, sizeof(env->fpdp));
    memcpy(&xsave->region[XSAVE_ST_SPACE], env->fpregs,
            sizeof env->fpregs);
    memcpy(&xsave->region[XSAVE_XMM_SPACE], env->xmm_regs,
            sizeof env->xmm_regs);
    xsave->region[XSAVE_MXCSR] = env->mxcsr;
    *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
    memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
            sizeof env->ymmh_regs);
    r = kvm_vcpu_ioctl(env, KVM_SET_XSAVE, xsave);
    return r;
}

static int kvm_put_xcrs(CPUX86State *env)
{
    struct kvm_xcrs xcrs;

    if (!kvm_has_xcrs()) {
        return 0;
    }

    xcrs.nr_xcrs = 1;
    xcrs.flags = 0;
    xcrs.xcrs[0].xcr = 0;
    xcrs.xcrs[0].value = env->xcr0;
    return kvm_vcpu_ioctl(env, KVM_SET_XCRS, &xcrs);
}

static int kvm_put_sregs(CPUX86State *env)
{
    struct kvm_sregs sregs;

    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
    if (env->interrupt_injected >= 0) {
        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
                (uint64_t)1 << (env->interrupt_injected % 64);
    }

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;
    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.cr8 = cpu_get_apic_tpr(env->apic_state);
    sregs.apic_base = cpu_get_apic_base(env->apic_state);

    sregs.efer = env->efer;

    return kvm_vcpu_ioctl(env, KVM_SET_SREGS, &sregs);
}

static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
                              uint32_t index, uint64_t value)
{
    entry->index = index;
    entry->data = value;
}

static int kvm_put_msrs(CPUX86State *env, int level)
{
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int n = 0;

    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    kvm_msr_entry_set(&msrs[n++], MSR_PAT, env->pat);
    if (has_msr_star) {
        kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
    }
    if (has_msr_hsave_pa) {
        kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
    }
    if (has_msr_tsc_deadline) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSCDEADLINE, env->tsc_deadline);
    }
    if (has_msr_misc_enable) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                          env->msr_ia32_misc_enable);
    }
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
        kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
        kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
    }
#endif
    if (level == KVM_PUT_FULL_STATE) {
        /*
         * KVM is yet unable to synchronize TSC values of multiple VCPUs on
         * writeback. Until this is fixed, we only write the offset to SMP
         * guests after migration, desynchronizing the VCPUs, but avoiding
         * huge jump-backs that would occur without any writeback at all.
         */
        if (smp_cpus == 1 || env->tsc != 0) {
            kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
        }
    }
    /*
     * The following paravirtual MSRs have side effects on the guest or are
     * too heavy for normal writeback. Limit them to reset or full state
     * updates.
     */
    if (level >= KVM_PUT_RESET_STATE) {
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                          env->system_time_msr);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
        if (has_msr_async_pf_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_ASYNC_PF_EN,
                              env->async_pf_en_msr);
        }
        if (has_msr_pv_eoi_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_PV_EOI_EN,
                              env->pv_eoi_en_msr);
        }
        if (hyperv_hypercall_available()) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_GUEST_OS_ID, 0);
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_HYPERCALL, 0);
        }
        if (hyperv_vapic_recommended()) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
        }
    }
    if (env->mcg_cap) {
        int i;

        kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
        kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
            kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
        }
    }

    msr_data.info.nmsrs = n;

    return kvm_vcpu_ioctl(env, KVM_SET_MSRS, &msr_data);

}


static int kvm_get_fpu(CPUX86State *env)
{
    struct kvm_fpu fpu;
    int i, ret;

    ret = kvm_vcpu_ioctl(env, KVM_GET_FPU, &fpu);
    if (ret < 0) {
        return ret;
    }

    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    env->fpop = fpu.last_opcode;
    env->fpip = fpu.last_ip;
    env->fpdp = fpu.last_dp;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    }
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    return 0;
}

static int kvm_get_xsave(CPUX86State *env)
{
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
    int ret, i;
    uint16_t cwd, swd, twd;

    if (!kvm_has_xsave()) {
        return kvm_get_fpu(env);
    }

    ret = kvm_vcpu_ioctl(env, KVM_GET_XSAVE, xsave);
    if (ret < 0) {
        return ret;
    }

    cwd = (uint16_t)xsave->region[XSAVE_FCW_FSW];
    swd = (uint16_t)(xsave->region[XSAVE_FCW_FSW] >> 16);
    twd = (uint16_t)xsave->region[XSAVE_FTW_FOP];
    env->fpop = (uint16_t)(xsave->region[XSAVE_FTW_FOP] >> 16);
    env->fpstt = (swd >> 11) & 7;
    env->fpus = swd;
    env->fpuc = cwd;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((twd >> i) & 1);
    }
    memcpy(&env->fpip, &xsave->region[XSAVE_CWD_RIP], sizeof(env->fpip));
    memcpy(&env->fpdp, &xsave->region[XSAVE_CWD_RDP], sizeof(env->fpdp));
    env->mxcsr = xsave->region[XSAVE_MXCSR];
    memcpy(env->fpregs, &xsave->region[XSAVE_ST_SPACE],
            sizeof env->fpregs);
    memcpy(env->xmm_regs, &xsave->region[XSAVE_XMM_SPACE],
            sizeof env->xmm_regs);
    env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
    memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
            sizeof env->ymmh_regs);
    return 0;
}

static int kvm_get_xcrs(CPUX86State *env)
{
    int i, ret;
    struct kvm_xcrs xcrs;

    if (!kvm_has_xcrs()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(env, KVM_GET_XCRS, &xcrs);
    if (ret < 0) {
        return ret;
    }

    for (i = 0; i < xcrs.nr_xcrs; i++) {
        /* Only support xcr0 now */
        if (xcrs.xcrs[0].xcr == 0) {
            env->xcr0 = xcrs.xcrs[0].value;
            break;
        }
    }
    return 0;
}

static int kvm_get_sregs(CPUX86State *env)
{
    struct kvm_sregs sregs;
    uint32_t hflags;
    int bit, i, ret;

    ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs);
    if (ret < 0) {
        return ret;
    }

    /* There can only be one pending IRQ set in the bitmap at a time, so try
       to find it and save its number instead (-1 for none). */
    env->interrupt_injected = -1;
    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
        if (sregs.interrupt_bitmap[i]) {
            bit = ctz64(sregs.interrupt_bitmap[i]);
            env->interrupt_injected = i * 64 + bit;
            break;
        }
    }

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    env->efer = sregs.efer;

    /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */

#define HFLAG_COPY_MASK \
    ~( HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
       HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
       HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
       HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
                (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
                (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
                    (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
                    (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) || (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base | env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) << HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;

    return 0;
}

static int kvm_get_msrs(CPUX86State *env)
{
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int ret, i, n;

    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    msrs[n++].index = MSR_PAT;
    if (has_msr_star) {
        msrs[n++].index = MSR_STAR;
    }
    if (has_msr_hsave_pa) {
        msrs[n++].index = MSR_VM_HSAVE_PA;
    }
    if (has_msr_tsc_deadline) {
        msrs[n++].index = MSR_IA32_TSCDEADLINE;
    }
    if (has_msr_misc_enable) {
        msrs[n++].index = MSR_IA32_MISC_ENABLE;
    }

    if (!env->tsc_valid) {
        msrs[n++].index = MSR_IA32_TSC;
        env->tsc_valid = !runstate_is_running();
    }

#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        msrs[n++].index = MSR_CSTAR;
        msrs[n++].index = MSR_KERNELGSBASE;
        msrs[n++].index = MSR_FMASK;
        msrs[n++].index = MSR_LSTAR;
    }
#endif
    msrs[n++].index = MSR_KVM_SYSTEM_TIME;
    msrs[n++].index = MSR_KVM_WALL_CLOCK;
    if (has_msr_async_pf_en) {
        msrs[n++].index = MSR_KVM_ASYNC_PF_EN;
    }
    if (has_msr_pv_eoi_en) {
        msrs[n++].index = MSR_KVM_PV_EOI_EN;
    }

    if (env->mcg_cap) {
        msrs[n++].index = MSR_MCG_STATUS;
        msrs[n++].index = MSR_MCG_CTL;
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
            msrs[n++].index = MSR_MC0_CTL + i;
        }
    }

    msr_data.info.nmsrs = n;
    ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
    if (ret < 0) {
        return ret;
    }

    for (i = 0; i < ret; i++) {
        switch (msrs[i].index) {
        case MSR_IA32_SYSENTER_CS:
            env->sysenter_cs = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_ESP:
            env->sysenter_esp = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_EIP:
            env->sysenter_eip = msrs[i].data;
            break;
        case MSR_PAT:
            env->pat = msrs[i].data;
            break;
        case MSR_STAR:
            env->star = msrs[i].data;
            break;
#ifdef TARGET_X86_64
        case MSR_CSTAR:
            env->cstar = msrs[i].data;
            break;
        case MSR_KERNELGSBASE:
            env->kernelgsbase = msrs[i].data;
            break;
        case MSR_FMASK:
            env->fmask = msrs[i].data;
            break;
        case MSR_LSTAR:
            env->lstar = msrs[i].data;
            break;
#endif
        case MSR_IA32_TSC:
            env->tsc = msrs[i].data;
            break;
        case MSR_IA32_TSCDEADLINE:
            env->tsc_deadline = msrs[i].data;
            break;
        case MSR_VM_HSAVE_PA:
            env->vm_hsave = msrs[i].data;
            break;
        case MSR_KVM_SYSTEM_TIME:
            env->system_time_msr = msrs[i].data;
            break;
        case MSR_KVM_WALL_CLOCK:
            env->wall_clock_msr = msrs[i].data;
            break;
        case MSR_MCG_STATUS:
            env->mcg_status = msrs[i].data;
            break;
        case MSR_MCG_CTL:
            env->mcg_ctl = msrs[i].data;
            break;
        case MSR_IA32_MISC_ENABLE:
            env->msr_ia32_misc_enable = msrs[i].data;
            break;
        default:
            if (msrs[i].index >= MSR_MC0_CTL &&
                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
            }
            break;
        case MSR_KVM_ASYNC_PF_EN:
            env->async_pf_en_msr = msrs[i].data;
            break;
        case MSR_KVM_PV_EOI_EN:
            env->pv_eoi_en_msr = msrs[i].data;
            break;
        }
    }

    return 0;
}

static int kvm_put_mp_state(CPUX86State *env)
{
    struct kvm_mp_state mp_state = { .mp_state = env->mp_state };

    return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, &mp_state);
}

static int kvm_get_mp_state(CPUX86State *env)
{
    struct kvm_mp_state mp_state;
    int ret;

    ret = kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, &mp_state);
    if (ret < 0) {
        return ret;
    }
    env->mp_state = mp_state.mp_state;
    if (kvm_irqchip_in_kernel()) {
        env->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
    }
    return 0;
}

static int kvm_get_apic(CPUX86State *env)
{
    DeviceState *apic = env->apic_state;
    struct kvm_lapic_state kapic;
    int ret;

    if (apic && kvm_irqchip_in_kernel()) {
        ret = kvm_vcpu_ioctl(env, KVM_GET_LAPIC, &kapic);
        if (ret < 0) {
            return ret;
        }

        kvm_get_apic_state(apic, &kapic);
    }
    return 0;
}

static int kvm_put_apic(CPUX86State *env)
{
    DeviceState *apic = env->apic_state;
    struct kvm_lapic_state kapic;

    if (apic && kvm_irqchip_in_kernel()) {
        kvm_put_apic_state(apic, &kapic);

        return kvm_vcpu_ioctl(env, KVM_SET_LAPIC, &kapic);
    }
    return 0;
}

static int kvm_put_vcpu_events(CPUX86State *env, int level)
{
    struct kvm_vcpu_events events;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    events.exception.injected = (env->exception_injected >= 0);
    events.exception.nr = env->exception_injected;
    events.exception.has_error_code = env->has_error_code;
    events.exception.error_code = env->error_code;
    events.exception.pad = 0;

    events.interrupt.injected = (env->interrupt_injected >= 0);
    events.interrupt.nr = env->interrupt_injected;
    events.interrupt.soft = env->soft_interrupt;

    events.nmi.injected = env->nmi_injected;
    events.nmi.pending = env->nmi_pending;
    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
    events.nmi.pad = 0;

    events.sipi_vector = env->sipi_vector;

    events.flags = 0;
    if (level >= KVM_PUT_RESET_STATE) {
        events.flags |=
            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
    }

    return kvm_vcpu_ioctl(env, KVM_SET_VCPU_EVENTS, &events);
}

static int kvm_get_vcpu_events(CPUX86State *env)
{
    struct kvm_vcpu_events events;
    int ret;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(env, KVM_GET_VCPU_EVENTS, &events);
    if (ret < 0) {
       return ret;
    }
    env->exception_injected =
       events.exception.injected ? events.exception.nr : -1;
    env->has_error_code = events.exception.has_error_code;
    env->error_code = events.exception.error_code;

    env->interrupt_injected =
        events.interrupt.injected ? events.interrupt.nr : -1;
    env->soft_interrupt = events.interrupt.soft;

    env->nmi_injected = events.nmi.injected;
    env->nmi_pending = events.nmi.pending;
    if (events.nmi.masked) {
        env->hflags2 |= HF2_NMI_MASK;
    } else {
        env->hflags2 &= ~HF2_NMI_MASK;
    }

    env->sipi_vector = events.sipi_vector;

    return 0;
}

static int kvm_guest_debug_workarounds(CPUX86State *env)
{
    int ret = 0;
    unsigned long reinject_trap = 0;

    if (!kvm_has_vcpu_events()) {
        if (env->exception_injected == 1) {
            reinject_trap = KVM_GUESTDBG_INJECT_DB;
        } else if (env->exception_injected == 3) {
            reinject_trap = KVM_GUESTDBG_INJECT_BP;
        }
        env->exception_injected = -1;
    }

    /*
     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
     * by updating the debug state once again if single-stepping is on.
     * Another reason to call kvm_update_guest_debug here is a pending debug
     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
     * reinject them via SET_GUEST_DEBUG.
1504
     */
1505
    if (reinject_trap ||
1506
        (!kvm_has_robust_singlestep() && env->singlestep_enabled)) {
1507
        ret = kvm_update_guest_debug(env, reinject_trap);
1508
    }
1509
    return ret;
1510
}
1511

    
1512
static int kvm_put_debugregs(CPUX86State *env)
1513
{
1514
    struct kvm_debugregs dbgregs;
1515
    int i;
1516

    
1517
    if (!kvm_has_debugregs()) {
1518
        return 0;
1519
    }
1520

    
1521
    for (i = 0; i < 4; i++) {
1522
        dbgregs.db[i] = env->dr[i];
1523
    }
1524
    dbgregs.dr6 = env->dr[6];
1525
    dbgregs.dr7 = env->dr[7];
1526
    dbgregs.flags = 0;
1527

    
1528
    return kvm_vcpu_ioctl(env, KVM_SET_DEBUGREGS, &dbgregs);
1529
}
1530

    
1531
static int kvm_get_debugregs(CPUX86State *env)
1532
{
1533
    struct kvm_debugregs dbgregs;
1534
    int i, ret;
1535

    
1536
    if (!kvm_has_debugregs()) {
1537
        return 0;
1538
    }
1539

    
1540
    ret = kvm_vcpu_ioctl(env, KVM_GET_DEBUGREGS, &dbgregs);
1541
    if (ret < 0) {
1542
        return ret;
1543
    }
1544
    for (i = 0; i < 4; i++) {
1545
        env->dr[i] = dbgregs.db[i];
1546
    }
1547
    env->dr[4] = env->dr[6] = dbgregs.dr6;
1548
    env->dr[5] = env->dr[7] = dbgregs.dr7;
1549

    
1550
    return 0;
1551
}
1552

    
1553
int kvm_arch_put_registers(CPUX86State *env, int level)
1554
{
1555
    int ret;
1556

    
1557
    assert(cpu_is_stopped(env) || qemu_cpu_is_self(env));
1558

    
1559
    ret = kvm_getput_regs(env, 1);
1560
    if (ret < 0) {
1561
        return ret;
1562
    }
1563
    ret = kvm_put_xsave(env);
1564
    if (ret < 0) {
1565
        return ret;
1566
    }
1567
    ret = kvm_put_xcrs(env);
1568
    if (ret < 0) {
1569
        return ret;
1570
    }
1571
    ret = kvm_put_sregs(env);
1572
    if (ret < 0) {
1573
        return ret;
1574
    }
1575
    /* must be before kvm_put_msrs */
1576
    ret = kvm_inject_mce_oldstyle(env);
1577
    if (ret < 0) {
1578
        return ret;
1579
    }
1580
    ret = kvm_put_msrs(env, level);
1581
    if (ret < 0) {
1582
        return ret;
1583
    }
1584
    if (level >= KVM_PUT_RESET_STATE) {
1585
        ret = kvm_put_mp_state(env);
1586
        if (ret < 0) {
1587
            return ret;
1588
        }
1589
        ret = kvm_put_apic(env);
1590
        if (ret < 0) {
1591
            return ret;
1592
        }
1593
    }
1594
    ret = kvm_put_vcpu_events(env, level);
1595
    if (ret < 0) {
1596
        return ret;
1597
    }
1598
    ret = kvm_put_debugregs(env);
1599
    if (ret < 0) {
1600
        return ret;
1601
    }
1602
    /* must be last */
1603
    ret = kvm_guest_debug_workarounds(env);
1604
    if (ret < 0) {
1605
        return ret;
1606
    }
1607
    return 0;
1608
}
1609

    
1610
int kvm_arch_get_registers(CPUX86State *env)
1611
{
1612
    int ret;
1613

    
1614
    assert(cpu_is_stopped(env) || qemu_cpu_is_self(env));
1615

    
1616
    ret = kvm_getput_regs(env, 0);
1617
    if (ret < 0) {
1618
        return ret;
1619
    }
1620
    ret = kvm_get_xsave(env);
1621
    if (ret < 0) {
1622
        return ret;
1623
    }
1624
    ret = kvm_get_xcrs(env);
1625
    if (ret < 0) {
1626
        return ret;
1627
    }
1628
    ret = kvm_get_sregs(env);
1629
    if (ret < 0) {
1630
        return ret;
1631
    }
1632
    ret = kvm_get_msrs(env);
1633
    if (ret < 0) {
1634
        return ret;
1635
    }
1636
    ret = kvm_get_mp_state(env);
1637
    if (ret < 0) {
1638
        return ret;
1639
    }
1640
    ret = kvm_get_apic(env);
1641
    if (ret < 0) {
1642
        return ret;
1643
    }
1644
    ret = kvm_get_vcpu_events(env);
1645
    if (ret < 0) {
1646
        return ret;
1647
    }
1648
    ret = kvm_get_debugregs(env);
1649
    if (ret < 0) {
1650
        return ret;
1651
    }
1652
    return 0;
1653
}
1654

    
1655
void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
{
    int ret;

    /* Inject NMI */
    if (env->interrupt_request & CPU_INTERRUPT_NMI) {
        env->interrupt_request &= ~CPU_INTERRUPT_NMI;
        DPRINTF("injected NMI\n");
        ret = kvm_vcpu_ioctl(env, KVM_NMI);
        if (ret < 0) {
            fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
                    strerror(-ret));
        }
    }

    if (!kvm_irqchip_in_kernel()) {
        /* Force the VCPU out of its inner loop to process any INIT requests
         * or pending TPR access reports. */
        if (env->interrupt_request &
            (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
            env->exit_request = 1;
        }

        /* Try to inject an interrupt if the guest can accept it */
        if (run->ready_for_interrupt_injection &&
            (env->interrupt_request & CPU_INTERRUPT_HARD) &&
            (env->eflags & IF_MASK)) {
            int irq;

            env->interrupt_request &= ~CPU_INTERRUPT_HARD;
            irq = cpu_get_pic_interrupt(env);
            if (irq >= 0) {
                struct kvm_interrupt intr;

                intr.irq = irq;
                DPRINTF("injected interrupt %d\n", irq);
                ret = kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
                if (ret < 0) {
                    fprintf(stderr,
                            "KVM: injection failed, interrupt lost (%s)\n",
                            strerror(-ret));
                }
            }
        }

        /* If we have an interrupt but the guest is not ready to receive an
         * interrupt, request an interrupt window exit.  This will
         * cause a return to userspace as soon as the guest is ready to
         * receive interrupts. */
        if ((env->interrupt_request & CPU_INTERRUPT_HARD)) {
            run->request_interrupt_window = 1;
        } else {
            run->request_interrupt_window = 0;
        }

        DPRINTF("setting tpr\n");
        run->cr8 = cpu_get_apic_tpr(env->apic_state);
    }
}

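/* Copy back the volatile state KVM reports on every exit: the IF flag,
 * CR8/TPR and the APIC base. */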
void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
{
    if (run->if_flag) {
        env->eflags |= IF_MASK;
    } else {
        env->eflags &= ~IF_MASK;
    }
    cpu_set_apic_tpr(env->apic_state, run->cr8);
    cpu_set_apic_base(env->apic_state, run->apic_base);
}

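/*
 * Handle interrupt_request bits that must be processed outside KVM_RUN:
 * pending MCEs are converted into an injected #MC (or a system reset on
 * triple fault), and without an in-kernel irqchip the POLL/INIT/SIPI/TPR
 * requests are serviced here as well.  Returns non-zero while the CPU
 * should stay halted.
 */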
int kvm_arch_process_async_events(CPUX86State *env)
{
    X86CPU *cpu = x86_env_get_cpu(env);

    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
        assert(env->mcg_cap);

        env->interrupt_request &= ~CPU_INTERRUPT_MCE;

        kvm_cpu_synchronize_state(env);

        if (env->exception_injected == EXCP08_DBLE) {
            /* this means triple fault */
            qemu_system_reset_request();
            env->exit_request = 1;
            return 0;
        }
        env->exception_injected = EXCP12_MCHK;
        env->has_error_code = 0;

        env->halted = 0;
        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
            env->mp_state = KVM_MP_STATE_RUNNABLE;
        }
    }

    if (kvm_irqchip_in_kernel()) {
        return 0;
    }

    if (env->interrupt_request & CPU_INTERRUPT_POLL) {
        env->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(env->apic_state);
    }
    if (((env->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (env->interrupt_request & CPU_INTERRUPT_NMI)) {
        env->halted = 0;
    }
    if (env->interrupt_request & CPU_INTERRUPT_INIT) {
        kvm_cpu_synchronize_state(env);
        do_cpu_init(cpu);
    }
    if (env->interrupt_request & CPU_INTERRUPT_SIPI) {
        kvm_cpu_synchronize_state(env);
        do_cpu_sipi(cpu);
    }
    if (env->interrupt_request & CPU_INTERRUPT_TPR) {
        env->interrupt_request &= ~CPU_INTERRUPT_TPR;
        kvm_cpu_synchronize_state(env);
        apic_handle_tpr_access_report(env->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return env->halted;
}

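/* A HLT exit leaves the vCPU halted only if neither a maskable interrupt
 * (with IF set) nor an NMI is pending; otherwise execution simply resumes. */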
static int kvm_handle_halt(CPUX86State *env)
{
    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK)) &&
        !(env->interrupt_request & CPU_INTERRUPT_NMI)) {
        env->halted = 1;
        return EXCP_HLT;
    }

    return 0;
}

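/* Forward a TPR access report from KVM to the userspace APIC model. */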
static int kvm_handle_tpr_access(CPUX86State *env)
{
    struct kvm_run *run = env->kvm_run;

    apic_handle_tpr_access_report(env->apic_state, run->tpr_access.rip,
                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
                                                           : TPR_ACCESS_READ);
    return 1;
}

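/*
 * Software breakpoints are implemented by saving the original byte at the
 * breakpoint address and patching in the one-byte INT3 opcode (0xcc).
 */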
int kvm_arch_insert_sw_breakpoint(CPUX86State *env, struct kvm_sw_breakpoint *bp)
{
    static const uint8_t int3 = 0xcc;

    if (cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&int3, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

int kvm_arch_remove_sw_breakpoint(CPUX86State *env, struct kvm_sw_breakpoint *bp)
{
    uint8_t int3;

    if (cpu_memory_rw_debug(env, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

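/*
 * x86 exposes four hardware breakpoint slots (DR0-DR3).  This table tracks
 * what will be programmed into the guest debug registers by
 * kvm_arch_update_guest_debug() below.
 */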
static struct {
    target_ulong addr;
    int len;
    int type;
} hw_breakpoint[4];

static int nb_hw_breakpoint;

static int find_hw_breakpoint(target_ulong addr, int len, int type)
{
    int n;

    for (n = 0; n < nb_hw_breakpoint; n++) {
        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
            (hw_breakpoint[n].len == len || len == -1)) {
            return n;
        }
    }
    return -1;
}

int kvm_arch_insert_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    switch (type) {
    case GDB_BREAKPOINT_HW:
        len = 1;
        break;
    case GDB_WATCHPOINT_WRITE:
    case GDB_WATCHPOINT_ACCESS:
        switch (len) {
        case 1:
            break;
        case 2:
        case 4:
        case 8:
            if (addr & (len - 1)) {
                return -EINVAL;
            }
            break;
        default:
            return -EINVAL;
        }
        break;
    default:
        return -ENOSYS;
    }

    if (nb_hw_breakpoint == 4) {
        return -ENOBUFS;
    }
    if (find_hw_breakpoint(addr, len, type) >= 0) {
        return -EEXIST;
    }
    hw_breakpoint[nb_hw_breakpoint].addr = addr;
    hw_breakpoint[nb_hw_breakpoint].len = len;
    hw_breakpoint[nb_hw_breakpoint].type = type;
    nb_hw_breakpoint++;

    return 0;
}

int kvm_arch_remove_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    int n;

    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
    if (n < 0) {
        return -ENOENT;
    }
    nb_hw_breakpoint--;
    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];

    return 0;
}

void kvm_arch_remove_all_hw_breakpoints(void)
{
    nb_hw_breakpoint = 0;
}

static CPUWatchpoint hw_watchpoint;

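/*
 * Decode a #DB exit.  DR6 bit 14 (BS) indicates a single-step trap; bits
 * 0-3 indicate which of the four hardware breakpoints fired.  The matching
 * R/W field in DR7 (00 = execution, 01 = data write, 11 = data read/write)
 * tells us whether to report a breakpoint or a watchpoint to the debugger.
 * Debug events we did not set up ourselves are re-injected into the guest.
 */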
static int kvm_handle_debug(struct kvm_debug_exit_arch *arch_info)
{
    int ret = 0;
    int n;

    if (arch_info->exception == 1) {
        if (arch_info->dr6 & (1 << 14)) {
            if (cpu_single_env->singlestep_enabled) {
                ret = EXCP_DEBUG;
            }
        } else {
            for (n = 0; n < 4; n++) {
                if (arch_info->dr6 & (1 << n)) {
                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
                    case 0x0:
                        ret = EXCP_DEBUG;
                        break;
                    case 0x1:
                        ret = EXCP_DEBUG;
                        cpu_single_env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_WRITE;
                        break;
                    case 0x3:
                        ret = EXCP_DEBUG;
                        cpu_single_env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_ACCESS;
                        break;
                    }
                }
            }
        }
    } else if (kvm_find_sw_breakpoint(cpu_single_env, arch_info->pc)) {
        ret = EXCP_DEBUG;
    }
    if (ret == 0) {
        cpu_synchronize_state(cpu_single_env);
        assert(cpu_single_env->exception_injected == -1);

        /* pass to guest */
        cpu_single_env->exception_injected = arch_info->exception;
        cpu_single_env->has_error_code = 0;
    }

    return ret;
}

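/*
 * Translate QEMU's breakpoint list into the KVM_SET_GUEST_DEBUG layout:
 * debugreg[0..3] carry the addresses, and debugreg[7] gets the per-slot
 * local-enable bits plus the R/W type (bits 16 + 4n) and length encoding
 * (bits 18 + 4n) expected by DR7.
 */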
void kvm_arch_update_guest_debug(CPUX86State *env, struct kvm_guest_debug *dbg)
{
    const uint8_t type_code[] = {
        [GDB_BREAKPOINT_HW] = 0x0,
        [GDB_WATCHPOINT_WRITE] = 0x1,
        [GDB_WATCHPOINT_ACCESS] = 0x3
    };
    const uint8_t len_code[] = {
        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
    };
    int n;

    if (kvm_sw_breakpoints_active(env)) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
    }
    if (nb_hw_breakpoint > 0) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
        dbg->arch.debugreg[7] = 0x0600;
        for (n = 0; n < nb_hw_breakpoint; n++) {
            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
        }
    }
}

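/* Check the host's CPUID leaf 1 ECX for the VMX feature flag. */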
static bool host_supports_vmx(void)
{
    uint32_t ecx, unused;

    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
    return ecx & CPUID_EXT_VMX;
}

#define VMX_INVALID_GUEST_STATE 0x80000021

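/*
 * Arch-specific exit dispatcher, called for exit reasons the generic KVM
 * loop does not handle itself.  A negative return value is treated as a
 * fatal error by the caller.
 */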
int kvm_arch_handle_exit(CPUX86State *env, struct kvm_run *run)
{
    uint64_t code;
    int ret;

    switch (run->exit_reason) {
    case KVM_EXIT_HLT:
        DPRINTF("handle_hlt\n");
        ret = kvm_handle_halt(env);
        break;
    case KVM_EXIT_SET_TPR:
        ret = 0;
        break;
    case KVM_EXIT_TPR_ACCESS:
        ret = kvm_handle_tpr_access(env);
        break;
    case KVM_EXIT_FAIL_ENTRY:
        code = run->fail_entry.hardware_entry_failure_reason;
        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
                code);
        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
            fprintf(stderr,
                    "\nIf you're running a guest on an Intel machine without "
                        "unrestricted mode\n"
                    "support, the failure is most likely due to the guest "
                        "entering an invalid\n"
                    "state for Intel VT. For example, the guest may be running "
                        "in big real mode\n"
                    "which is not supported on older Intel processors."
                        "\n\n");
        }
        ret = -1;
        break;
    case KVM_EXIT_EXCEPTION:
        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
                run->ex.exception, run->ex.error_code);
        ret = -1;
        break;
    case KVM_EXIT_DEBUG:
        DPRINTF("kvm_exit_debug\n");
        ret = kvm_handle_debug(&run->debug.arch);
        break;
    default:
        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
        ret = -1;
        break;
    }

    return ret;
}

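/* Stop on an emulation failure only if the guest is in real mode or is
 * running privileged (CPL != 3) code; ring-3 failures do not stop the VM. */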
bool kvm_arch_stop_on_emulation_error(CPUX86State *env)
{
    kvm_cpu_synchronize_state(env);
    return !(env->cr[0] & CR0_PE_MASK) ||
           ((env->segs[R_CS].selector & 3) != 3);
}

void kvm_arch_init_irq_routing(KVMState *s)
{
    if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
        /* If kernel can't do irq routing, interrupt source
         * override 0->2 cannot be set up as required by HPET.
         * So we have to disable it.
         */
        no_hpet = 1;
    }
    /* We know at this point that we're using the in-kernel
     * irqchip, so we can use irqfds, and on x86 we know
     * we can use msi via irqfd and GSI routing.
     */
    kvm_irqfds_allowed = true;
    kvm_msi_via_irqfd_allowed = true;
    kvm_gsi_routing_allowed = true;
}

/* Classic KVM device assignment interface. Will remain x86 only. */
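/*
 * A typical call sequence for this legacy interface (a sketch, not an
 * exhaustive description): kvm_device_pci_assign() registers the host
 * device, kvm_device_intx_assign()/kvm_device_msi_assign()/
 * kvm_device_msix_assign() wire up its interrupt, and the matching
 * *_deassign() calls undo the setup on unplug.
 */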
int kvm_device_pci_assign(KVMState *s, PCIHostDeviceAddress *dev_addr,
                          uint32_t flags, uint32_t *dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .segnr = dev_addr->domain,
        .busnr = dev_addr->bus,
        .devfn = PCI_DEVFN(dev_addr->slot, dev_addr->function),
        .flags = flags,
    };
    int ret;

    dev_data.assigned_dev_id =
        (dev_addr->domain << 16) | (dev_addr->bus << 8) | dev_data.devfn;

    ret = kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data);
    if (ret < 0) {
        return ret;
    }

    *dev_id = dev_data.assigned_dev_id;

    return 0;
}

int kvm_device_pci_deassign(KVMState *s, uint32_t dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_PCI_DEVICE, &dev_data);
}

static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
                                   uint32_t irq_type, uint32_t guest_irq)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .guest_irq = guest_irq,
        .flags = irq_type,
    };

    if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
    } else {
        return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
    }
}

int kvm_device_intx_assign(KVMState *s, uint32_t dev_id, bool use_host_msi,
                           uint32_t guest_irq)
{
    uint32_t irq_type = KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX);

    return kvm_assign_irq_internal(s, dev_id, irq_type, guest_irq);
}

int kvm_device_intx_set_mask(KVMState *s, uint32_t dev_id, bool masked)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
        .flags = masked ? KVM_DEV_ASSIGN_MASK_INTX : 0,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_INTX_MASK, &dev_data);
}

static int kvm_deassign_irq_internal(KVMState *s, uint32_t dev_id,
                                     uint32_t type)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .flags = type,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ, &assigned_irq);
}

int kvm_device_intx_deassign(KVMState *s, uint32_t dev_id, bool use_host_msi)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX));
}

int kvm_device_msi_assign(KVMState *s, uint32_t dev_id, int virq)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSI |
                                              KVM_DEV_IRQ_GUEST_MSI, virq);
}

int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI |
                                                KVM_DEV_IRQ_HOST_MSI);
}

bool kvm_device_msix_supported(KVMState *s)
{
    /* The kernel lacks a corresponding KVM_CAP, so we probe by calling
     * KVM_ASSIGN_SET_MSIX_NR with an invalid parameter. */
    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, NULL) == -EFAULT;
}

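/*
 * MSI-X setup sketch: after kvm_device_pci_assign(), the caller first sizes
 * the table with kvm_device_msix_init_vectors(), then maps each vector to a
 * routing entry with kvm_device_msix_set_vector(), and finally enables the
 * interrupt with kvm_device_msix_assign().
 */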
int kvm_device_msix_init_vectors(KVMState *s, uint32_t dev_id,
                                 uint32_t nr_vectors)
{
    struct kvm_assigned_msix_nr msix_nr = {
        .assigned_dev_id = dev_id,
        .entry_nr = nr_vectors,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, &msix_nr);
}

int kvm_device_msix_set_vector(KVMState *s, uint32_t dev_id, uint32_t vector,
                               int virq)
{
    struct kvm_assigned_msix_entry msix_entry = {
        .assigned_dev_id = dev_id,
        .gsi = virq,
        .entry = vector,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_ENTRY, &msix_entry);
}

int kvm_device_msix_assign(KVMState *s, uint32_t dev_id)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSIX |
                                              KVM_DEV_IRQ_GUEST_MSIX, 0);
}

int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSIX |
                                                KVM_DEV_IRQ_HOST_MSIX);
}