/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "sysemu.h"
#include "kvm.h"
#include "cpu.h"
#include "gdbstub.h"
#include "host-utils.h"
#include "hw/pc.h"
#include "ioport.h"

#ifdef CONFIG_KVM_PARA
#include <linux/kvm_para.h>
#endif

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define dprintf(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif

#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

#ifdef KVM_CAP_EXT_CPUID

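/*
 * Ask KVM for its supported CPUID entries, assuming the table holds at most
 * 'max' entries.  KVM_GET_SUPPORTED_CPUID reports E2BIG when the buffer is
 * too small; in that case NULL is returned so the caller can retry with a
 * larger allocation.
 */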
static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = (struct kvm_cpuid2 *)qemu_mallocz(size);
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            qemu_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

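/*
 * Report what the host-side KVM supports for one register of one CPUID
 * function.  The required table size is not known up front, so the
 * allocation is doubled until try_get_cpuid() succeeds.  Two leaves are
 * patched up: function 1 EDX gains bits that KVM before 2.6.30 misreports,
 * and function 0x80000001 EDX is widened with the AMD-defined aliases of the
 * function 1 bits.  Typical use (illustrative):
 *
 *     uint32_t edx = kvm_arch_get_supported_cpuid(env, 1, R_EDX);
 */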
uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function, int reg)
{
    struct kvm_cpuid2 *cpuid;
    int i, max;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx;

    if (!kvm_check_extension(env->kvm_state, KVM_CAP_EXT_CPUID)) {
        return -1U;
    }

    max = 1;
    while ((cpuid = try_get_cpuid(env->kvm_state, max)) == NULL) {
        max *= 2;
    }

    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function) {
            switch (reg) {
            case R_EAX:
                ret = cpuid->entries[i].eax;
                break;
            case R_EBX:
                ret = cpuid->entries[i].ebx;
                break;
            case R_ECX:
                ret = cpuid->entries[i].ecx;
                break;
            case R_EDX:
                ret = cpuid->entries[i].edx;
                switch (function) {
                case 1:
                    /* KVM before 2.6.30 misreports the following features */
                    ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
                    break;
                case 0x80000001:
                    /* On Intel, kvm returns cpuid according to the Intel spec,
                     * so add missing bits according to the AMD spec:
                     */
                    cpuid_1_edx = kvm_arch_get_supported_cpuid(env, 1, R_EDX);
                    ret |= cpuid_1_edx & 0xdfeff7ff;
                    break;
                }
                break;
            }
        }
    }

    qemu_free(cpuid);

    return ret;
}

#else

uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function, int reg)
{
    return -1U;
}

#endif

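/*
 * Clear every bit in *features that the host-side KVM does not report as
 * supported, so the guest is never advertised a feature that cannot be
 * virtualized.
 */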
static void kvm_trim_features(uint32_t *features, uint32_t supported)
{
    int i;
    uint32_t mask;

    for (i = 0; i < 32; ++i) {
        mask = 1U << i;
        if ((*features & mask) && !(supported & mask)) {
            *features &= ~mask;
        }
    }
}

#ifdef CONFIG_KVM_PARA
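/*
 * Map host KVM capabilities (KVM_CAP_*) to the paravirtual feature bits
 * (KVM_FEATURE_*) that may be exposed to the guest via the KVM CPUID leaf.
 * The table ends with a { -1, -1 } sentinel.
 */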
struct kvm_para_features {
    int cap;
    int feature;
} para_features[] = {
#ifdef KVM_CAP_CLOCKSOURCE
    { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
#endif
#ifdef KVM_CAP_NOP_IO_DELAY
    { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
#endif
#ifdef KVM_CAP_PV_MMU
    { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
#endif
    { -1, -1 }
};

static int get_para_features(CPUState *env)
{
    int i, features = 0;

    for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
        if (kvm_check_extension(env->kvm_state, para_features[i].cap))
            features |= (1 << para_features[i].feature);
    }

    return features;
}
#endif

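/*
 * Build the CPUID table the guest will see and install it with
 * KVM_SET_CPUID2: trim the configured feature words to what the host
 * supports, add the paravirtual signature/feature leaves when CONFIG_KVM_PARA
 * is set, and expand the stateful (function 2) and index-significant
 * (functions 4, 0xb, 0xd) leaves into one entry per sub-leaf.
 */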
int kvm_arch_init_vcpu(CPUState *env)
{
    struct {
        struct kvm_cpuid2 cpuid;
        struct kvm_cpuid_entry2 entries[100];
    } __attribute__((packed)) cpuid_data;
    uint32_t limit, i, j, cpuid_i;
    uint32_t unused;
    struct kvm_cpuid_entry2 *c;
#ifdef KVM_CPUID_SIGNATURE
    uint32_t signature[3];
#endif

    env->mp_state = KVM_MP_STATE_RUNNABLE;

    kvm_trim_features(&env->cpuid_features,
        kvm_arch_get_supported_cpuid(env, 1, R_EDX));

    i = env->cpuid_ext_features & CPUID_EXT_HYPERVISOR;
    kvm_trim_features(&env->cpuid_ext_features,
        kvm_arch_get_supported_cpuid(env, 1, R_ECX));
    env->cpuid_ext_features |= i;

    kvm_trim_features(&env->cpuid_ext2_features,
        kvm_arch_get_supported_cpuid(env, 0x80000001, R_EDX));
    kvm_trim_features(&env->cpuid_ext3_features,
        kvm_arch_get_supported_cpuid(env, 0x80000001, R_ECX));

    cpuid_i = 0;

#ifdef CONFIG_KVM_PARA
    /* Paravirtualization CPUIDs */
    memcpy(signature, "KVMKVMKVM\0\0\0", 12);
    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_SIGNATURE;
    c->eax = 0;
    c->ebx = signature[0];
    c->ecx = signature[1];
    c->edx = signature[2];

    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_FEATURES;
    c->eax = env->cpuid_kvm_features & get_para_features(env);
#endif

    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);

    for (i = 0; i <= limit; i++) {
        c = &cpuid_data.entries[cpuid_i++];

        switch (i) {
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                       KVM_CPUID_FLAG_STATE_READ_NEXT;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax & 0xff;

            for (j = 1; j < times; ++j) {
                c = &cpuid_data.entries[cpuid_i++];
                c->function = i;
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        case 4:
        case 0xb:
        case 0xd:
            for (j = 0; ; j++) {
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (i == 4 && c->eax == 0)
                    break;
                if (i == 0xb && !(c->ecx & 0xff00))
                    break;
                if (i == 0xd && c->eax == 0)
                    break;

                c = &cpuid_data.entries[cpuid_i++];
            }
            break;
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            break;
        }
    }
    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);

    for (i = 0x80000000; i <= limit; i++) {
        c = &cpuid_data.entries[cpuid_i++];

        c->function = i;
        c->flags = 0;
        cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
    }

    cpuid_data.cpuid.nent = cpuid_i;

    return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
}

void kvm_arch_reset_vcpu(CPUState *env)
{
    env->exception_injected = -1;
    env->interrupt_injected = -1;
    env->nmi_injected = 0;
    env->nmi_pending = 0;
}

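/*
 * Determine (once, then cache) whether the kernel's MSR save/restore list
 * contains MSR_STAR.  KVM_GET_MSR_INDEX_LIST is issued twice: first with
 * nmsrs = 0 so the kernel fills in the list size (returning E2BIG), then
 * with a buffer large enough for the whole list.
 */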
static int kvm_has_msr_star(CPUState *env)
{
    static int has_msr_star;
    int ret;

    /* first time */
    if (has_msr_star == 0) {
        struct kvm_msr_list msr_list, *kvm_msr_list;

        has_msr_star = -1;

        /* Obtain MSR list from KVM.  These are the MSRs that we must
         * save/restore */
        msr_list.nmsrs = 0;
        ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, &msr_list);
        if (ret < 0 && ret != -E2BIG) {
            return 0;
        }
        /* Old kernel modules had a bug and could write beyond the provided
           memory. Allocate at least a safe amount of 1K. */
        kvm_msr_list = qemu_mallocz(MAX(1024, sizeof(msr_list) +
                                              msr_list.nmsrs *
                                              sizeof(msr_list.indices[0])));

        kvm_msr_list->nmsrs = msr_list.nmsrs;
        ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
        if (ret >= 0) {
            int i;

            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
                if (kvm_msr_list->indices[i] == MSR_STAR) {
                    has_msr_star = 1;
                    break;
                }
            }
        }

        qemu_free(kvm_msr_list);
    }

    if (has_msr_star == 1)
        return 1;
    return 0;
}

int kvm_arch_init(KVMState *s, int smp_cpus)
{
    int ret;

    /* create vm86 tss.  KVM uses vm86 mode to emulate 16-bit code
     * directly.  In order to use vm86 mode, a TSS is needed.  Since this
     * must be part of guest physical memory, we need to allocate it.  Older
     * versions of KVM just assumed that it would be at the end of physical
     * memory but that doesn't work with more than 4GB of memory.  We simply
     * refuse to work with those older versions of KVM. */
    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_SET_TSS_ADDR);
    if (ret <= 0) {
        fprintf(stderr, "kvm does not support KVM_CAP_SET_TSS_ADDR\n");
        return ret;
    }

    /* this address is 3 pages before the bios, and the bios should present
     * it as unavailable memory.  FIXME, need to ensure the e820 map deals
     * with this?
     */
    /*
     * Tell fw_cfg to notify the BIOS to reserve the range.
     */
    if (e820_add_entry(0xfffbc000, 0x4000, E820_RESERVED) < 0) {
        perror("e820_add_entry() table is full");
        exit(1);
    }
    return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, 0xfffbd000);
}

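/*
 * Translate between QEMU's SegmentCache (selector/base/limit plus packed
 * DESC_* flags) and the kvm_segment layout used by KVM_SET_SREGS and
 * KVM_GET_SREGS.  set_v8086_seg() applies the fixed attributes that vm86
 * mode segments must have; set_seg()/get_seg() pack and unpack the
 * descriptor flag bits field by field.
 */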
static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = rhs->selector & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags =
        (rhs->type << DESC_TYPE_SHIFT)
        | (rhs->present * DESC_P_MASK)
        | (rhs->dpl << DESC_DPL_SHIFT)
        | (rhs->db << DESC_B_SHIFT)
        | (rhs->s * DESC_S_MASK)
        | (rhs->l << DESC_L_SHIFT)
        | (rhs->g * DESC_G_MASK)
        | (rhs->avl * DESC_AVL_MASK);
}

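/*
 * kvm_getput_reg() copies a single register in either direction;
 * kvm_getput_regs() uses it for the whole general-purpose register file, so
 * one code path serves both KVM_GET_REGS (set == 0) and KVM_SET_REGS
 * (set != 0).
 */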
static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
{
    if (set)
        *kvm_reg = *qemu_reg;
    else
        *qemu_reg = *kvm_reg;
}

static int kvm_getput_regs(CPUState *env, int set)
{
    struct kvm_regs regs;
    int ret = 0;

    if (!set) {
        ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs);
        if (ret < 0)
            return ret;
    }

    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
#ifdef TARGET_X86_64
    kvm_getput_reg(&regs.r8, &env->regs[8], set);
    kvm_getput_reg(&regs.r9, &env->regs[9], set);
    kvm_getput_reg(&regs.r10, &env->regs[10], set);
    kvm_getput_reg(&regs.r11, &env->regs[11], set);
    kvm_getput_reg(&regs.r12, &env->regs[12], set);
    kvm_getput_reg(&regs.r13, &env->regs[13], set);
    kvm_getput_reg(&regs.r14, &env->regs[14], set);
    kvm_getput_reg(&regs.r15, &env->regs[15], set);
#endif

    kvm_getput_reg(&regs.rflags, &env->eflags, set);
    kvm_getput_reg(&regs.rip, &env->eip, set);

    if (set)
        ret = kvm_vcpu_ioctl(env, KVM_SET_REGS, &regs);

    return ret;
}

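/*
 * CPUState keeps the x87 status unpacked (fpstt, fpus, fptags[]) while
 * struct kvm_fpu wants the hardware encoding: the top-of-stack index goes
 * into FSW bits 11-13 and the tag word becomes the abridged ftwx with one
 * valid/empty bit per register.
 */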
static int kvm_put_fpu(CPUState *env)
{
    struct kvm_fpu fpu;
    int i;

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    for (i = 0; i < 8; ++i)
        fpu.ftwx |= (!env->fptags[i]) << i;
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;

    return kvm_vcpu_ioctl(env, KVM_SET_FPU, &fpu);
}

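/*
 * Write the special registers (segments, descriptor tables, control
 * registers, APIC base/TPR, EFER) to KVM.  A still-pending injected
 * interrupt is encoded as a single set bit in interrupt_bitmap, and in vm86
 * mode the segment attributes are forced to the fixed values that mode
 * requires instead of being taken from the descriptor cache.
 */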
static int kvm_put_sregs(CPUState *env)
{
    struct kvm_sregs sregs;

    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
    if (env->interrupt_injected >= 0) {
        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
                (uint64_t)1 << (env->interrupt_injected % 64);
    }

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);

        if (env->cr[0] & CR0_PE_MASK) {
            /* force ss cpl to cs cpl */
            sregs.ss.selector = (sregs.ss.selector & ~3) |
                    (sregs.cs.selector & 3);
            sregs.ss.dpl = sregs.ss.selector & 3;
        }
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.cr8 = cpu_get_apic_tpr(env);
    sregs.apic_base = cpu_get_apic_base(env);

    sregs.efer = env->efer;

    return kvm_vcpu_ioctl(env, KVM_SET_SREGS, &sregs);
}

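/*
 * MSR write-back: kvm_msr_entry_set() fills one kvm_msr_entry and
 * kvm_put_msrs() batches them into a single KVM_SET_MSRS call.  The
 * time-related MSRs (TSC, KVM system time, wall clock) are only written on a
 * full state sync (KVM_PUT_FULL_STATE), so routine syncs leave the guest
 * clock state alone.
 */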
static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
                              uint32_t index, uint64_t value)
{
    entry->index = index;
    entry->data = value;
}

static int kvm_put_msrs(CPUState *env, int level)
{
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int n = 0;

    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    if (kvm_has_msr_star(env))
        kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
#ifdef TARGET_X86_64
    /* FIXME if lm capable */
    kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
    kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
    kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
    kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
#endif
    if (level == KVM_PUT_FULL_STATE) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                          env->system_time_msr);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
    }

    msr_data.info.nmsrs = n;

    return kvm_vcpu_ioctl(env, KVM_SET_MSRS, &msr_data);
}

static int kvm_get_fpu(CPUState *env)
{
    struct kvm_fpu fpu;
    int i, ret;

    ret = kvm_vcpu_ioctl(env, KVM_GET_FPU, &fpu);
    if (ret < 0)
        return ret;

    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    for (i = 0; i < 8; ++i)
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    return 0;
}

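/*
 * Read the special registers back from KVM and recompute env->hflags (CPL,
 * PE/MP/EM/TS, long-mode and default operand/address size bits) from the
 * freshly fetched segment descriptors and control registers, since the rest
 * of QEMU relies on those derived bits being current.
 */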
static int kvm_get_sregs(CPUState *env)
{
    struct kvm_sregs sregs;
    uint32_t hflags;
    int bit, i, ret;

    ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs);
    if (ret < 0)
        return ret;

    /* There can only be one pending IRQ set in the bitmap at a time, so try
       to find it and save its number instead (-1 for none). */
    env->interrupt_injected = -1;
    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
        if (sregs.interrupt_bitmap[i]) {
            bit = ctz64(sregs.interrupt_bitmap[i]);
            env->interrupt_injected = i * 64 + bit;
            break;
        }
    }

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    cpu_set_apic_base(env, sregs.apic_base);

    env->efer = sregs.efer;
    //cpu_set_apic_tpr(env, sregs.cr8);

#define HFLAG_COPY_MASK ~( \
                        HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
                        HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
                        HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
                        HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
            (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
            (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
                (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
                (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) ||
            (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base |
                        env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) << HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;

    return 0;
}

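/*
 * Fetch MSRs from KVM.  KVM_GET_MSRS returns the number of entries it
 * actually filled in, so only that many are consumed; each value is
 * dispatched to the matching CPUState field by index.
 */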
static int kvm_get_msrs(CPUState *env)
{
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int ret, i, n;

    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    if (kvm_has_msr_star(env))
        msrs[n++].index = MSR_STAR;
    msrs[n++].index = MSR_IA32_TSC;
#ifdef TARGET_X86_64
    /* FIXME lm_capable_kernel */
    msrs[n++].index = MSR_CSTAR;
    msrs[n++].index = MSR_KERNELGSBASE;
    msrs[n++].index = MSR_FMASK;
    msrs[n++].index = MSR_LSTAR;
#endif
    msrs[n++].index = MSR_KVM_SYSTEM_TIME;
    msrs[n++].index = MSR_KVM_WALL_CLOCK;

    msr_data.info.nmsrs = n;
    ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
    if (ret < 0)
        return ret;

    for (i = 0; i < ret; i++) {
        switch (msrs[i].index) {
        case MSR_IA32_SYSENTER_CS:
            env->sysenter_cs = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_ESP:
            env->sysenter_esp = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_EIP:
            env->sysenter_eip = msrs[i].data;
            break;
        case MSR_STAR:
            env->star = msrs[i].data;
            break;
#ifdef TARGET_X86_64
        case MSR_CSTAR:
            env->cstar = msrs[i].data;
            break;
        case MSR_KERNELGSBASE:
            env->kernelgsbase = msrs[i].data;
            break;
        case MSR_FMASK:
            env->fmask = msrs[i].data;
            break;
        case MSR_LSTAR:
            env->lstar = msrs[i].data;
            break;
#endif
        case MSR_IA32_TSC:
            env->tsc = msrs[i].data;
            break;
        case MSR_KVM_SYSTEM_TIME:
            env->system_time_msr = msrs[i].data;
            break;
        case MSR_KVM_WALL_CLOCK:
            env->wall_clock_msr = msrs[i].data;
            break;
        }
    }

    return 0;
}

static int kvm_put_mp_state(CPUState *env)
{
    struct kvm_mp_state mp_state = { .mp_state = env->mp_state };

    return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, &mp_state);
}

static int kvm_get_mp_state(CPUState *env)
{
    struct kvm_mp_state mp_state;
    int ret;

    ret = kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, &mp_state);
    if (ret < 0) {
        return ret;
    }
    env->mp_state = mp_state.mp_state;
    return 0;
}

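/*
 * Exchange pending exception/interrupt/NMI injection state and the SIPI
 * vector with KVM via KVM_SET_VCPU_EVENTS / KVM_GET_VCPU_EVENTS.  The
 * NMI-pending and SIPI-vector fields are only marked valid when writing
 * reset-level state or better, so they are left untouched on an ordinary
 * runtime sync.
 */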
static int kvm_put_vcpu_events(CPUState *env, int level)
{
#ifdef KVM_CAP_VCPU_EVENTS
    struct kvm_vcpu_events events;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    events.exception.injected = (env->exception_injected >= 0);
    events.exception.nr = env->exception_injected;
    events.exception.has_error_code = env->has_error_code;
    events.exception.error_code = env->error_code;

    events.interrupt.injected = (env->interrupt_injected >= 0);
    events.interrupt.nr = env->interrupt_injected;
    events.interrupt.soft = env->soft_interrupt;

    events.nmi.injected = env->nmi_injected;
    events.nmi.pending = env->nmi_pending;
    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);

    events.sipi_vector = env->sipi_vector;

    events.flags = 0;
    if (level >= KVM_PUT_RESET_STATE) {
        events.flags |=
            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
    }

    return kvm_vcpu_ioctl(env, KVM_SET_VCPU_EVENTS, &events);
#else
    return 0;
#endif
}

static int kvm_get_vcpu_events(CPUState *env)
{
#ifdef KVM_CAP_VCPU_EVENTS
    struct kvm_vcpu_events events;
    int ret;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(env, KVM_GET_VCPU_EVENTS, &events);
    if (ret < 0) {
        return ret;
    }
    env->exception_injected =
        events.exception.injected ? events.exception.nr : -1;
    env->has_error_code = events.exception.has_error_code;
    env->error_code = events.exception.error_code;

    env->interrupt_injected =
        events.interrupt.injected ? events.interrupt.nr : -1;
    env->soft_interrupt = events.interrupt.soft;

    env->nmi_injected = events.nmi.injected;
    env->nmi_pending = events.nmi.pending;
    if (events.nmi.masked) {
        env->hflags2 |= HF2_NMI_MASK;
    } else {
        env->hflags2 &= ~HF2_NMI_MASK;
    }

    env->sipi_vector = events.sipi_vector;
#endif

    return 0;
}

static int kvm_guest_debug_workarounds(CPUState *env)
{
    int ret = 0;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    unsigned long reinject_trap = 0;

    if (!kvm_has_vcpu_events()) {
        if (env->exception_injected == 1) {
            reinject_trap = KVM_GUESTDBG_INJECT_DB;
        } else if (env->exception_injected == 3) {
            reinject_trap = KVM_GUESTDBG_INJECT_BP;
        }
        env->exception_injected = -1;
    }

    /*
     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
     * by updating the debug state once again if single-stepping is on.
     * Another reason to call kvm_update_guest_debug here is a pending debug
     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
     * reinject them via SET_GUEST_DEBUG.
     */
    if (reinject_trap ||
        (!kvm_has_robust_singlestep() && env->singlestep_enabled)) {
        ret = kvm_update_guest_debug(env, reinject_trap);
    }
#endif /* KVM_CAP_SET_GUEST_DEBUG */
    return ret;
}

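/*
 * Write the complete vCPU state to KVM: general-purpose registers, FPU,
 * special registers and MSRs, then (for reset or init level) the MP state,
 * followed by the event/injection state.  The guest-debug workarounds run
 * last because they may have to override flags written just before.
 */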
int kvm_arch_put_registers(CPUState *env, int level)
{
    int ret;

    ret = kvm_getput_regs(env, 1);
    if (ret < 0)
        return ret;

    ret = kvm_put_fpu(env);
    if (ret < 0)
        return ret;

    ret = kvm_put_sregs(env);
    if (ret < 0)
        return ret;

    ret = kvm_put_msrs(env, level);
    if (ret < 0)
        return ret;

    if (level >= KVM_PUT_RESET_STATE) {
        ret = kvm_put_mp_state(env);
        if (ret < 0)
            return ret;
    }

    ret = kvm_put_vcpu_events(env, level);
    if (ret < 0)
        return ret;

    /* must be last */
    ret = kvm_guest_debug_workarounds(env);
    if (ret < 0)
        return ret;

    return 0;
}

int kvm_arch_get_registers(CPUState *env)
{
    int ret;

    ret = kvm_getput_regs(env, 0);
    if (ret < 0)
        return ret;

    ret = kvm_get_fpu(env);
    if (ret < 0)
        return ret;

    ret = kvm_get_sregs(env);
    if (ret < 0)
        return ret;

    ret = kvm_get_msrs(env);
    if (ret < 0)
        return ret;

    ret = kvm_get_mp_state(env);
    if (ret < 0)
        return ret;

    ret = kvm_get_vcpu_events(env);
    if (ret < 0)
        return ret;

    return 0;
}

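/*
 * Called right before KVM_RUN.  If an external interrupt is pending and the
 * guest can accept it now, inject it with KVM_INTERRUPT; otherwise request
 * an interrupt-window exit so control returns to userspace as soon as the
 * guest becomes interruptible.  The current TPR is forwarded in cr8.
 */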
int kvm_arch_pre_run(CPUState *env, struct kvm_run *run)
{
    /* Try to inject an interrupt if the guest can accept it */
    if (run->ready_for_interrupt_injection &&
        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->eflags & IF_MASK)) {
        int irq;

        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            struct kvm_interrupt intr;
            intr.irq = irq;
            /* FIXME: errors */
            dprintf("injected interrupt %d\n", irq);
            kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
        }
    }

    /* If we have an interrupt but the guest is not ready to receive an
     * interrupt, request an interrupt window exit.  This will
     * cause a return to userspace as soon as the guest is ready to
     * receive interrupts. */
    if ((env->interrupt_request & CPU_INTERRUPT_HARD))
        run->request_interrupt_window = 1;
    else
        run->request_interrupt_window = 0;

    dprintf("setting tpr\n");
    run->cr8 = cpu_get_apic_tpr(env);

    return 0;
}

int kvm_arch_post_run(CPUState *env, struct kvm_run *run)
{
    if (run->if_flag)
        env->eflags |= IF_MASK;
    else
        env->eflags &= ~IF_MASK;

    cpu_set_apic_tpr(env, run->cr8);
    cpu_set_apic_base(env, run->apic_base);

    return 0;
}

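/*
 * HLT exit handler: returns 0 when the vCPU really halts (env->halted set,
 * EXCP_HLT reported) and 1 when an interrupt or NMI is pending and the guest
 * should keep running.
 */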
static int kvm_handle_halt(CPUState *env)
{
    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK)) &&
        !(env->interrupt_request & CPU_INTERRUPT_NMI)) {
        env->halted = 1;
        env->exception_index = EXCP_HLT;
        return 0;
    }

    return 1;
}

int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run)
{
    int ret = 0;

    switch (run->exit_reason) {
    case KVM_EXIT_HLT:
        dprintf("handle_hlt\n");
        ret = kvm_handle_halt(env);
        break;
    }

    return ret;
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
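/*
 * Software breakpoints: save the original byte at the breakpoint address and
 * patch in an int3 (0xcc) opcode; removal verifies the 0xcc is still present
 * before restoring the saved instruction byte.
 */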
int kvm_arch_insert_sw_breakpoint(CPUState *env, struct kvm_sw_breakpoint *bp)
{
    static const uint8_t int3 = 0xcc;

    if (cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&int3, 1, 1))
        return -EINVAL;
    return 0;
}

int kvm_arch_remove_sw_breakpoint(CPUState *env, struct kvm_sw_breakpoint *bp)
{
    uint8_t int3;

    if (cpu_memory_rw_debug(env, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1))
        return -EINVAL;
    return 0;
}

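/*
 * Up to four hardware breakpoints/watchpoints are tracked here, matching the
 * four x86 debug address registers DR0-DR3; they are programmed into the
 * guest via dbg->arch.debugreg[] in kvm_arch_update_guest_debug().
 */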
static struct {
    target_ulong addr;
    int len;
    int type;
} hw_breakpoint[4];

static int nb_hw_breakpoint;

static int find_hw_breakpoint(target_ulong addr, int len, int type)
{
    int n;

    for (n = 0; n < nb_hw_breakpoint; n++)
        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
            (hw_breakpoint[n].len == len || len == -1))
            return n;
    return -1;
}

int kvm_arch_insert_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    switch (type) {
    case GDB_BREAKPOINT_HW:
        len = 1;
        break;
    case GDB_WATCHPOINT_WRITE:
    case GDB_WATCHPOINT_ACCESS:
        switch (len) {
        case 1:
            break;
        case 2:
        case 4:
        case 8:
            if (addr & (len - 1))
                return -EINVAL;
            break;
        default:
            return -EINVAL;
        }
        break;
    default:
        return -ENOSYS;
    }

    if (nb_hw_breakpoint == 4)
        return -ENOBUFS;

    if (find_hw_breakpoint(addr, len, type) >= 0)
        return -EEXIST;

    hw_breakpoint[nb_hw_breakpoint].addr = addr;
    hw_breakpoint[nb_hw_breakpoint].len = len;
    hw_breakpoint[nb_hw_breakpoint].type = type;
    nb_hw_breakpoint++;

    return 0;
}

int kvm_arch_remove_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    int n;

    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
    if (n < 0)
        return -ENOENT;

    nb_hw_breakpoint--;
    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];

    return 0;
}

void kvm_arch_remove_all_hw_breakpoints(void)
{
    nb_hw_breakpoint = 0;
}

static CPUWatchpoint hw_watchpoint;

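/*
 * Handle a debug exit.  DR6 bit 14 (BS) indicates a single-step trap; bits
 * 0-3 identify which hardware breakpoint fired, and the corresponding DR7
 * type field distinguishes execution breakpoints from write/access
 * watchpoints.  Debug exceptions we did not set up ourselves are re-injected
 * into the guest.
 */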
int kvm_arch_debug(struct kvm_debug_exit_arch *arch_info)
{
    int handle = 0;
    int n;

    if (arch_info->exception == 1) {
        if (arch_info->dr6 & (1 << 14)) {
            if (cpu_single_env->singlestep_enabled)
                handle = 1;
        } else {
            for (n = 0; n < 4; n++)
                if (arch_info->dr6 & (1 << n))
                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
                    case 0x0:
                        handle = 1;
                        break;
                    case 0x1:
                        handle = 1;
                        cpu_single_env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_WRITE;
                        break;
                    case 0x3:
                        handle = 1;
                        cpu_single_env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_ACCESS;
                        break;
                    }
        }
    } else if (kvm_find_sw_breakpoint(cpu_single_env, arch_info->pc))
        handle = 1;

    if (!handle) {
        cpu_synchronize_state(cpu_single_env);
        assert(cpu_single_env->exception_injected == -1);

        cpu_single_env->exception_injected = arch_info->exception;
        cpu_single_env->has_error_code = 0;
    }

    return handle;
}

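/*
 * Translate the breakpoint list into KVM guest-debug flags and DR7 contents:
 * bits 0-7 carry the per-slot enable bits (2 << (n * 2) sets the global
 * enable for slot n), while bits 16 and up hold a 2-bit type code and a
 * 2-bit length code for each of the four slots.
 */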
void kvm_arch_update_guest_debug(CPUState *env, struct kvm_guest_debug *dbg)
{
    const uint8_t type_code[] = {
        [GDB_BREAKPOINT_HW] = 0x0,
        [GDB_WATCHPOINT_WRITE] = 0x1,
        [GDB_WATCHPOINT_ACCESS] = 0x3
    };
    const uint8_t len_code[] = {
        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
    };
    int n;

    if (kvm_sw_breakpoints_active(env))
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;

    if (nb_hw_breakpoint > 0) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
        dbg->arch.debugreg[7] = 0x0600;
        for (n = 0; n < nb_hw_breakpoint; n++) {
            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
                (len_code[hw_breakpoint[n].len] << (18 + n*4));
        }
    }
}
#endif /* KVM_CAP_SET_GUEST_DEBUG */