/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;
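
/* A KVMSlot describes one guest-physical memory region backed by QEMU RAM:
 * guest physical addresses [start_addr, start_addr + memory_size) map to
 * ram_addr_t offsets [phys_offset, phys_offset + memory_size).
 * Illustrative (hypothetical) example: a slot with start_addr 0x100000,
 * memory_size 0x10000 and phys_offset 0x20000 maps guest address 0x104000
 * to ram_addr 0x24000; kvm_physical_memory_addr_from_ram() below performs
 * this translation in the reverse direction. */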

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
    int xsave, xcrs;
    int many_ioeventfds;
};

KVMState *kvm_state;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_LAST_INFO
};

static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        if (s->slots[i].memory_size == 0) {
            return &s->slots[i];
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}
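
/* Two ranges overlap iff each one starts before the other ends, which is
 * exactly the test above: end_addr > mem->start_addr and start_addr <
 * mem->start_addr + mem->memory_size.  Illustrative (hypothetical) case:
 * a request for [0x2000, 0x5000) overlaps a slot covering [0x4000, 0x8000)
 * because 0x5000 > 0x4000 and 0x2000 < 0x8000. */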

int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
                                      target_phys_addr_t *phys_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (ram_addr >= mem->phys_offset &&
            ram_addr < mem->phys_offset + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
            return 1;
        }
    }

    return 0;
}

static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}
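
/* Note that kvm_set_user_memory_region() is also how slots are dropped:
 * callers below unregister a slot by setting its memory_size to 0 before
 * invoking it, which KVM interprets as deletion of that slot. */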

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}

int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;
    env->kvm_vcpu_dirty = 1;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}
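
/* The kvm_run structure is a shared page (or pages) mapped from the vcpu fd;
 * the kernel uses it to pass exit information to userspace.  When coalesced
 * MMIO is available, KVM_CHECK_EXTENSION(KVM_CAP_COALESCED_MMIO) returns the
 * page offset of the coalesced-MMIO ring inside that mapping, which is why
 * s->coalesced_mmio is multiplied by PAGE_SIZE above. */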

/*
 * dirty pages logging control
 */
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

static int kvm_log_start(CPUPhysMemoryClient *client,
                         target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_log_stop(CPUPhysMemoryClient *client,
                        target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, 0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!mem->memory_size) {
            continue;
        }
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * Walking the bitmap is faster than walking the memory page by page,
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}
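
/* Illustrative (hypothetical) walk: if bitmap[0] == 0x5, the inner loop
 * extracts the set bits via ffsl(), first j == 0 (page 0), then j == 2
 * (page 2).  Each set bit marks page i * HOST_LONG_BITS + j of the slot
 * dirty, so only dirty pages are ever touched. */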

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
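
/* ALIGN() rounds x up to the next multiple of y and assumes y is a power of
 * two, e.g. ALIGN(13, 8) == 16 and ALIGN(16, 8) == 16. */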

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty(): pages reported dirty by the kernel
 * are marked dirty in qemu's bitmap as well.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        /* XXX bad kernel interface alert
         * For the dirty bitmap, the kernel allocates an array of size
         * aligned to bits-per-long.  But when the kernel is 64 bits and
         * userspace is 32 bits, userspace can't align to the same
         * bits-per-long, since sizeof(long) differs between kernel and
         * user space.  Userspace would then provide a buffer that may be
         * 4 bytes smaller than the kernel uses, resulting in userspace
         * memory corruption (which is not detectable by valgrind either,
         * in most cases).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here,
         * in the hope that sizeof(long) won't become >8 any time soon.
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) < 0) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}
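
/* Sizing example (hypothetical numbers): a 1 MiB slot with 4 KiB target
 * pages covers 256 pages; ALIGN(256, 64) / 8 == 32, so a 32-byte bitmap is
 * passed to KVM_GET_DIRTY_LOG for that slot. */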

int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}
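
/* kvm_check_extension() returns 0 when a capability is absent; a positive
 * return value is capability-specific.  For KVM_CAP_COALESCED_MMIO, for
 * instance, it is the page offset of the ring buffer used in
 * kvm_init_vcpu() above. */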

static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD) && defined(CONFIG_IOTHREAD)
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}
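
/* The probe uses an array of 7 eventfds because of the historical 6-device
 * limit mentioned above: successfully registering all 7 proves the host has
 * lifted that limit, so "many" ioeventfds can be used safely. */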

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, not even
         * by unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If that is not the case, or if
         * some overlapping slot comes around later, we will fail (not seen
         * in practice so far) and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }
    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}
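
/* Splitting example (hypothetical addresses): registering a new region
 * [0x3000, 0x5000) over an old slot [0x1000, 0x8000) first deletes the old
 * slot, then re-registers its remainder as a prefix slot [0x1000, 0x3000)
 * and a suffix slot [0x5000, 0x8000), before the loop exits and the new
 * region itself is registered. */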

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size, ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
};

static void kvm_handle_interrupt(CPUState *env, int mask)
{
    env->interrupt_request |= mask;

    if (!qemu_cpu_is_self(env)) {
        qemu_cpu_kick(env);
    }
}

int kvm_init(void)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        s->slots[i].slot = i;
    }
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
        ret = s->vmfd;
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        goto err;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->xsave = 0;
#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

    s->xcrs = 0;
#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

    ret = kvm_arch_init(s);
    if (ret < 0) {
        goto err;
    }

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    cpu_interrupt_handler = kvm_handle_interrupt;

    return 0;

err:
    if (s) {
        if (s->vmfd != -1) {
            close(s->vmfd);
        }
        if (s->fd != -1) {
            close(s->fd);
        }
    }
    qemu_free(s);

    return ret;
}
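
/* Each optional feature above follows the same pattern: the KVM_CAP_* probe
 * is compiled only when the host headers define the capability, and the
 * result of kvm_check_extension() is cached in KVMState so the kvm_has_*()
 * accessors below never have to re-issue the ioctl. */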

static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
                          uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    } else {
        fprintf(stderr, "\n");
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
#endif

void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
}
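
/* The coalesced-MMIO ring is a single-producer (kernel), single-consumer
 * (this function) queue: 'first' is the consumer index and 'last' the
 * producer index.  The smp_wmb() ensures the entry has been fully consumed
 * before 'first' is advanced, since advancing it hands the entry back to
 * the kernel for reuse. */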

static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty) {
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
    }
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}
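
/* kvm_vcpu_dirty tracks which side holds the authoritative register state:
 * when set, QEMU's CPUState is newer than the kernel's copy and is written
 * back with KVM_PUT_RUNTIME_STATE before the next KVM_RUN (see
 * kvm_cpu_exec() below); when clear, the kernel's copy is authoritative and
 * must be fetched before QEMU may inspect or modify it. */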

int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(env)) {
        env->exit_request = 0;
        return EXCP_HLT;
    }

    cpu_single_env = env;

    do {
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();

        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        kvm_flush_coalesced_mmio_buffer();

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                ret = EXCP_INTERRUPT;
                break;
            }
            DPRINTF("kvm run failed %s\n", strerror(-run_ret));
            abort();
        }

        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret == 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }

    env->exit_request = 0;
    cpu_single_env = NULL;
    return ret;
}

int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}
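
/* The three wrappers above differ only in the fd the ioctl is issued
 * against: the /dev/kvm system fd, the VM fd, or a vcpu fd.  All of them
 * convert the -1/errno convention into a -errno return value, e.g.
 * kvm_ioctl(s, KVM_GET_API_VERSION, 0) as used in kvm_init(). */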

int kvm_has_sync_mmu(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);

        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr,
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
            exit(1);
        }
    }
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}
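
/* KVM_SET_GUEST_DEBUG is a vcpu ioctl, so it is funneled through
 * run_on_cpu() to make sure it executes on the thread that owns the vcpu
 * rather than on whichever thread requested the debug change. */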

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                           bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}
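
/* sigmask->len is the size of the signal set in bytes as seen by the
 * kernel; 8 bytes covers the kernel's 64 signals, which is what
 * KVM_SET_SIGNAL_MASK expects here. */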

int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    struct kvm_ioeventfd iofd;

    iofd.datamatch = val;
    iofd.addr = addr;
    iofd.len = 4;
    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
    iofd.fd = fd;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
{
    return kvm_arch_on_sigbus_vcpu(env, code, addr);
}

int kvm_on_sigbus(int code, void *addr)
{
    return kvm_arch_on_sigbus(code, addr);
}