/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
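
/* Bookkeeping for one KVM memory slot: the guest-physical window it covers,
 * its size, the qemu ram_addr_t backing it, and the slot number and flags
 * that are passed to the kernel. */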
typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
#ifdef KVM_CAP_COALESCED_MMIO
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
    int xsave, xcrs;
};

static KVMState *kvm_state;
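
/* Return the first unused slot (memory_size == 0). Indices 8-11 are skipped:
 * they are reserved as KVM private memory slots. Running out of slots is
 * fatal. */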
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        /* KVM private memory slots */
        if (i >= 8 && i < 12)
            continue;
        if (s->slots[i].memory_size == 0)
            return &s->slots[i];
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}
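
/* Push one slot's current state to the kernel via
 * KVM_SET_USER_MEMORY_REGION. Dirty logging is forced on while migration
 * logging is active. */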
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_get_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}
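
/* Create a vcpu in the kernel, mmap its shared kvm_run area and hand the
 * rest of the setup to the architecture-specific code. */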
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

#ifdef KVM_CAP_COALESCED_MMIO
    if (s->coalesced_mmio && !s->coalesced_mmio_ring)
        s->coalesced_mmio_ring = (void *) env->kvm_run +
                s->coalesced_mmio * PAGE_SIZE;
#endif

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!mem->memory_size) {
            continue;
        }
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}
/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty().
 * This means all bits are set to dirty.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}
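
/* Ask the kernel to coalesce guest writes to the given MMIO range into the
 * shared ring instead of exiting on every access; returns -ENOSYS when the
 * capability is absent. */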
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}
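
/* Query a KVM_CAP_* constant: returns 0 if the extension is absent, a
 * positive, extension-specific value if present. */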
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}
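
/* Core slot-management logic: register, resize or split kernel memory slots
 * so that they mirror qemu's view of guest physical memory. Existing
 * overlapping slots are unregistered first; prefix/suffix fragments are
 * re-registered around the new range. */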
static void kvm_set_phys_mem(target_phys_addr_t start_addr,
                             ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    /* KVM works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, not even
         * by unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size)
        return;

    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED)
        return;

    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size,
                                  ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
};
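
/* One-time accelerator setup: open /dev/kvm, verify the API version, create
 * the VM, probe the optional capabilities cached in KVMState and register
 * the memory client that keeps slots in sync. */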
int kvm_init(int smp_cpus)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++)
        s->slots[i].slot = i;

    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0)
            ret = -EINVAL;
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        ret = s->vmfd; /* propagate the KVM_CREATE_VM error */
        goto err;
    }

    /* initially, KVM allocated its own memory and we had to jump through
     * hoops to make phys_ram_base point to this.  Modern versions of KVM
     * just use a user allocated buffer so we can use regular pages
     * unmodified.  Make sure we have a sufficiently modern version of KVM.
     */
    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
                upgrade_note);
        goto err;
    }

    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
     * destroyed properly.  Since we rely on this capability, refuse to work
     * with any kernel without this capability. */
    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        ret = -EINVAL;

        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        goto err;
    }

    s->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_mmio_ring = NULL;
#endif

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->xsave = 0;
#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

    s->xcrs = 0;
#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

    ret = kvm_arch_init(s, smp_cpus);
    if (ret < 0)
        goto err;

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    return 0;

err:
    if (s) {
        if (s->vmfd != -1)
            close(s->vmfd);
        if (s->fd != -1)
            close(s->fd);
    }
    qemu_free(s);

    return ret;
}
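
/* Satisfy a KVM_EXIT_IO request by forwarding each element of the data
 * buffer to qemu's port I/O handlers, in the direction the kernel asked
 * for. Always returns 1 so the run loop continues. */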
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
                         uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }

    return 1;
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static void kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, "KVM internal error. Suberror: %d\n",
                run->internal.suberror);

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    cpu_dump_state(env, stderr, fprintf, 0);
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env))
            return;
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    vm_stop(0);
}
#endif
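
/* Drain the ring of MMIO writes that the kernel coalesced instead of
 * exiting to userspace, replaying them through the normal memory path. */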
void kvm_flush_coalesced_mmio_buffer(void)
{
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
#endif
}

static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty)
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}
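
/* Main vcpu run loop: flush dirty register state, enter the guest via
 * KVM_RUN (with the iothread lock dropped) and dispatch on the exit reason
 * until an event requires returning to the caller. */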
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    do {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            DPRINTF("kvm_exit_unknown\n");
            break;
        case KVM_EXIT_FAIL_ENTRY:
            DPRINTF("kvm_exit_fail_entry\n");
            break;
        case KVM_EXIT_EXCEPTION:
            DPRINTF("kvm_exit_exception\n");
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}
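
/* Thin ioctl wrappers for the three KVM fd levels (module, VM, vcpu). Each
 * takes at most one pointer/integer argument and converts the classic
 * -1/errno convention into a negative errno return value. */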
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_has_sync_mmu(void)
{
#ifdef KVM_CAP_SYNC_MMU
    KVMState *s = kvm_state;

    return kvm_check_extension(s, KVM_CAP_SYNC_MMU);
#else
    return 0;
#endif
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}
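
/* Without a synchronous KVM MMU, guest memory must not be duplicated by
 * fork(), so it is marked MADV_DONTFORK; if that is unavailable, we cannot
 * run safely and bail out. */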
void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
#ifdef MADV_DONTFORK
        int ret = madvise(start, size, MADV_DONTFORK);

        if (ret) {
            perror("madvise");
            exit(1);
        }
#else
        fprintf(stderr,
                "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
        exit(1);
#endif
    }
}
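
/* gdbstub support: software breakpoints are tracked in a per-VM list and
 * pushed to the kernel through KVM_SET_GUEST_DEBUG, issued on each vcpu's
 * own thread via run_on_cpu(). */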
#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc)
            return bp;
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp)
            return -ENOMEM;

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                           bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp)
            return -ENOENT;

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err)
            return err;

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0)
                    break;
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu)
        kvm_update_guest_debug(env, 0);
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset)
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}
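
/* ioeventfd: have the kernel signal an eventfd when the guest writes a
 * specific value to a given address (MMIO long or PIO word) instead of
 * taking a full exit to userspace for each write. */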
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    struct kvm_ioeventfd iofd;

    iofd.datamatch = val;
    iofd.addr = addr;
    iofd.len = 4;
    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
    iofd.fd = fd;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled())
        return -ENOSYS;
    if (!assign)
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0)
        return r;
    return 0;
#else
    return -ENOSYS;
#endif
}