Statistics
| Branch: | Revision:

root / kvm-all.c @ 6a7af8cb

History | View | Annotate | Download (34.6 kB)

1
/*
2
 * QEMU KVM support
3
 *
4
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
6
 *
7
 * Authors:
8
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
10
 *
11
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12
 * See the COPYING file in the top-level directory.
13
 *
14
 */
15

    
16
#include <sys/types.h>
17
#include <sys/ioctl.h>
18
#include <sys/mman.h>
19
#include <stdarg.h>
20

    
21
#include <linux/kvm.h>
22

    
23
#include "qemu-common.h"
24
#include "qemu-barrier.h"
25
#include "sysemu.h"
26
#include "hw/hw.h"
27
#include "gdbstub.h"
28
#include "kvm.h"
29
#include "bswap.h"
30

    
31
/* This check must be after config-host.h is included */
32
#ifdef CONFIG_EVENTFD
33
#include <sys/eventfd.h>
34
#endif
35

    
36
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
37
#define PAGE_SIZE TARGET_PAGE_SIZE
38

    
39
//#define DEBUG_KVM
40

    
41
/*
 * Debug tracing helper.
 *
 * With DEBUG_KVM defined, DPRINTF() formats its arguments to stderr.
 * Without it, DPRINTF() expands to an empty statement, so its arguments
 * are neither evaluated nor type-checked.
 */
#ifndef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#endif
48

    
49
typedef struct KVMSlot
50
{
51
    target_phys_addr_t start_addr;
52
    ram_addr_t memory_size;
53
    ram_addr_t phys_offset;
54
    int slot;
55
    int flags;
56
} KVMSlot;
57

    
58
/* Shorthand for the argument structure of the KVM_GET_DIRTY_LOG ioctl. */
typedef struct kvm_dirty_log KVMDirtyLog;
59

    
60
struct KVMState
61
{
62
    KVMSlot slots[32];
63
    int fd;
64
    int vmfd;
65
    int coalesced_mmio;
66
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
67
    int broken_set_mem_region;
68
    int migration_log;
69
    int vcpu_events;
70
    int robust_singlestep;
71
    int debugregs;
72
#ifdef KVM_CAP_SET_GUEST_DEBUG
73
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
74
#endif
75
    int irqchip_in_kernel;
76
    int pit_in_kernel;
77
    int xsave, xcrs;
78
    int many_ioeventfds;
79
};
80

    
81
/* Global KVM state; assigned by kvm_init() once initialization succeeded. */
KVMState *kvm_state;
82

    
83
static const KVMCapabilityInfo kvm_required_capabilites[] = {
84
    KVM_CAP_INFO(USER_MEMORY),
85
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
86
    KVM_CAP_LAST_INFO
87
};
88

    
89
/*
 * Return the first unused entry (memory_size == 0) of the slot table.
 * Does not return if the table is full: a message is printed and the
 * process aborts.
 */
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    KVMSlot *slot = s->slots;
    KVMSlot *const table_end = s->slots + ARRAY_SIZE(s->slots);

    for (; slot != table_end; slot++) {
        if (!slot->memory_size) {
            return slot;
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}
102

    
103
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
104
                                         target_phys_addr_t start_addr,
105
                                         target_phys_addr_t end_addr)
106
{
107
    int i;
108

    
109
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
110
        KVMSlot *mem = &s->slots[i];
111

    
112
        if (start_addr == mem->start_addr &&
113
            end_addr == mem->start_addr + mem->memory_size) {
114
            return mem;
115
        }
116
    }
117

    
118
    return NULL;
119
}
120

    
121
/*
122
 * Find overlapping slot with lowest start address
123
 */
124
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
125
                                            target_phys_addr_t start_addr,
126
                                            target_phys_addr_t end_addr)
127
{
128
    KVMSlot *found = NULL;
129
    int i;
130

    
131
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
132
        KVMSlot *mem = &s->slots[i];
133

    
134
        if (mem->memory_size == 0 ||
135
            (found && found->start_addr < mem->start_addr)) {
136
            continue;
137
        }
138

    
139
        if (end_addr > mem->start_addr &&
140
            start_addr < mem->start_addr + mem->memory_size) {
141
            found = mem;
142
        }
143
    }
144

    
145
    return found;
146
}
147

    
148
int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
149
                                      target_phys_addr_t *phys_addr)
150
{
151
    int i;
152

    
153
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
154
        KVMSlot *mem = &s->slots[i];
155

    
156
        if (ram_addr >= mem->phys_offset &&
157
            ram_addr < mem->phys_offset + mem->memory_size) {
158
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
159
            return 1;
160
        }
161
    }
162

    
163
    return 0;
164
}
165

    
166
/*
 * Push one slot description to the kernel via KVM_SET_USER_MEMORY_REGION.
 * While migration logging is active, KVM_MEM_LOG_DIRTY_PAGES is added to
 * the flags sent, without modifying slot->flags.
 * Returns the result of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;

    region.flags = slot->flags;
    if (s->migration_log) {
        region.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    region.userspace_addr =
        (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
    region.memory_size = slot->memory_size;
    region.guest_phys_addr = slot->start_addr;
    region.slot = slot->slot;

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}
180

    
181
static void kvm_reset_vcpu(void *opaque)
182
{
183
    CPUState *env = opaque;
184

    
185
    kvm_arch_reset_vcpu(env);
186
}
187

    
188
int kvm_irqchip_in_kernel(void)
189
{
190
    return kvm_state->irqchip_in_kernel;
191
}
192

    
193
int kvm_pit_in_kernel(void)
194
{
195
    return kvm_state->pit_in_kernel;
196
}
197

    
198
/*
 * Create the KVM vcpu for env, map its shared kvm_run area and run the
 * architecture-specific vcpu setup.
 *
 * Returns 0 on success or a negative value on failure.  On success a reset
 * handler is registered and the vcpu is reset once.
 */
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long run_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        return ret;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    run_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (run_size < 0) {
        ret = run_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        return ret;
    }

    env->kvm_run = mmap(NULL, run_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        /* capture errno before any further library call can change it */
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        return ret;
    }

    /* the ring lives coalesced_mmio pages into the first mapped kvm_run */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)((char *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE);
    }

    ret = kvm_arch_init_vcpu(env);
    if (ret != 0) {
        return ret;
    }

    qemu_register_reset(kvm_reset_vcpu, env);
    kvm_arch_reset_vcpu(env);
    return ret;
}
243

    
244
/*
245
 * dirty pages logging control
246
 */
247
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
248
                                      ram_addr_t size, int flags, int mask)
249
{
250
    KVMState *s = kvm_state;
251
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
252
    int old_flags;
253

    
254
    if (mem == NULL)  {
255
            fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
256
                    TARGET_FMT_plx "\n", __func__, phys_addr,
257
                    (target_phys_addr_t)(phys_addr + size - 1));
258
            return -EINVAL;
259
    }
260

    
261
    old_flags = mem->flags;
262

    
263
    flags = (mem->flags & ~mask) | flags;
264
    mem->flags = flags;
265

    
266
    /* If nothing changed effectively, no need to issue ioctl */
267
    if (s->migration_log) {
268
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
269
    }
270
    if (flags == old_flags) {
271
            return 0;
272
    }
273

    
274
    return kvm_set_user_memory_region(s, mem);
275
}
276

    
277
static int kvm_log_start(CPUPhysMemoryClient *client,
278
                         target_phys_addr_t phys_addr, ram_addr_t size)
279
{
280
    return kvm_dirty_pages_log_change(phys_addr, size, KVM_MEM_LOG_DIRTY_PAGES,
281
                                      KVM_MEM_LOG_DIRTY_PAGES);
282
}
283

    
284
static int kvm_log_stop(CPUPhysMemoryClient *client,
285
                        target_phys_addr_t phys_addr, ram_addr_t size)
286
{
287
    return kvm_dirty_pages_log_change(phys_addr, size, 0,
288
                                      KVM_MEM_LOG_DIRTY_PAGES);
289
}
290

    
291
static int kvm_set_migration_log(int enable)
292
{
293
    KVMState *s = kvm_state;
294
    KVMSlot *mem;
295
    int i, err;
296

    
297
    s->migration_log = enable;
298

    
299
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
300
        mem = &s->slots[i];
301

    
302
        if (!mem->memory_size) {
303
            continue;
304
        }
305
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
306
            continue;
307
        }
308
        err = kvm_set_user_memory_region(s, mem);
309
        if (err) {
310
            return err;
311
        }
312
    }
313
    return 0;
314
}
315

    
316
/* get kvm's dirty pages bitmap and update qemu's */
317
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
318
                                         unsigned long *bitmap,
319
                                         unsigned long offset,
320
                                         unsigned long mem_size)
321
{
322
    unsigned int i, j;
323
    unsigned long page_number, addr, addr1, c;
324
    ram_addr_t ram_addr;
325
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
326
        HOST_LONG_BITS;
327

    
328
    /*
329
     * bitmap-traveling is faster than memory-traveling (for addr...)
330
     * especially when most of the memory is not dirty.
331
     */
332
    for (i = 0; i < len; i++) {
333
        if (bitmap[i] != 0) {
334
            c = leul_to_cpu(bitmap[i]);
335
            do {
336
                j = ffsl(c) - 1;
337
                c &= ~(1ul << j);
338
                page_number = i * HOST_LONG_BITS + j;
339
                addr1 = page_number * TARGET_PAGE_SIZE;
340
                addr = offset + addr1;
341
                ram_addr = cpu_get_physical_page_desc(addr);
342
                cpu_physical_memory_set_dirty(ram_addr);
343
            } while (c != 0);
344
        }
345
    }
346
    return 0;
347
}
348

    
349
/*
 * Round x up to the next multiple of y.  The mask form used here gives that
 * result only when y is a power of two.  Both arguments are evaluated more
 * than once.
 */
#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
350

    
351
/**
352
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
353
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty().
354
 * This means all bits are set to dirty.
355
 *
356
 * @start_add: start of logged region.
357
 * @end_addr: end of logged region.
358
 */
359
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
360
                                          target_phys_addr_t end_addr)
361
{
362
    KVMState *s = kvm_state;
363
    unsigned long size, allocated_size = 0;
364
    KVMDirtyLog d;
365
    KVMSlot *mem;
366
    int ret = 0;
367

    
368
    d.dirty_bitmap = NULL;
369
    while (start_addr < end_addr) {
370
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
371
        if (mem == NULL) {
372
            break;
373
        }
374

    
375
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
376
        if (!d.dirty_bitmap) {
377
            d.dirty_bitmap = qemu_malloc(size);
378
        } else if (size > allocated_size) {
379
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
380
        }
381
        allocated_size = size;
382
        memset(d.dirty_bitmap, 0, allocated_size);
383

    
384
        d.slot = mem->slot;
385

    
386
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
387
            DPRINTF("ioctl failed %d\n", errno);
388
            ret = -1;
389
            break;
390
        }
391

    
392
        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
393
                                      mem->start_addr, mem->memory_size);
394
        start_addr = mem->start_addr + mem->memory_size;
395
    }
396
    qemu_free(d.dirty_bitmap);
397

    
398
    return ret;
399
}
400

    
401
/*
 * Register [start, start + size) as a coalesced MMIO zone.
 * Returns -ENOSYS when coalesced MMIO is not available, otherwise the
 * result of the KVM_REGISTER_COALESCED_MMIO ioctl.
 */
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;

    return kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
417

    
418
/*
 * Unregister the coalesced MMIO zone [start, start + size).
 * Returns -ENOSYS when coalesced MMIO is not available, otherwise the
 * result of the KVM_UNREGISTER_COALESCED_MMIO ioctl.
 */
int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;

    return kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
434

    
435
/*
 * Query KVM_CHECK_EXTENSION for the given extension.
 * Returns the ioctl's non-negative result; an ioctl error is reported as 0.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    const int answer = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return answer < 0 ? 0 : answer;
}
446

    
447
/*
 * Probe whether seven ioeventfds can be registered at the same time.
 * Returns 1 if all registrations succeeded, 0 otherwise (and always 0 when
 * built without CONFIG_EVENTFD or CONFIG_IOTHREAD).  Everything created by
 * the probe is torn down again before returning.
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD) && defined(CONFIG_IOTHREAD)
    int probe_fds[7];
    int created;
    int many;

    for (created = 0; created < ARRAY_SIZE(probe_fds); created++) {
        probe_fds[created] = eventfd(0, EFD_CLOEXEC);
        if (probe_fds[created] < 0) {
            break;
        }
        if (kvm_set_ioeventfd_pio_word(probe_fds[created], 0, created,
                                       true) < 0) {
            close(probe_fds[created]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    many = (created == ARRAY_SIZE(probe_fds));

    /* undo the successful registrations in reverse order */
    while (created-- > 0) {
        kvm_set_ioeventfd_pio_word(probe_fds[created], 0, created, false);
        close(probe_fds[created]);
    }
    return many;
#else
    return 0;
#endif
}
483

    
484
static const KVMCapabilityInfo *
485
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
486
{
487
    while (list->name) {
488
        if (!kvm_check_extension(s, list->value)) {
489
            return list;
490
        }
491
        list++;
492
    }
493
    return NULL;
494
}
495

    
496
/*
 * Mirror a guest physical memory mapping change into the KVM slot table.
 *
 * Every slot overlapping the new range is unregistered; the parts of it
 * lying outside the new range are re-registered as prefix/suffix slots.
 * Finally the new range itself is registered, unless its flags mark it as
 * memory KVM does not need to know about.  Any ioctl failure is fatal.
 */
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *slot;
    KVMSlot prev;
    int rc;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while ((slot = kvm_lookup_overlapping_slot(s, start_addr,
                                               start_addr + size)) != NULL) {
        if (flags < IO_MEM_UNASSIGNED && start_addr >= slot->start_addr &&
            (start_addr + size <= slot->start_addr + slot->memory_size) &&
            (phys_offset - start_addr == slot->phys_offset - slot->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        /* keep a copy: the table entry is about to be cleared */
        prev = *slot;

        /* unregister the overlapping slot */
        slot->memory_size = 0;
        rc = kvm_set_user_memory_region(s, slot);
        if (rc) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-rc));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
         * unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            prev.start_addr == start_addr && prev.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            slot = kvm_alloc_slot(s);
            slot->start_addr = prev.start_addr;
            slot->memory_size = prev.memory_size;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-rc));
                abort();
            }

            /* the re-registered fragment consumed the head of the range */
            start_addr += prev.memory_size;
            phys_offset += prev.memory_size;
            size -= prev.memory_size;
            continue;
        }

        /* register prefix slot */
        if (prev.start_addr < start_addr) {
            slot = kvm_alloc_slot(s);
            slot->start_addr = prev.start_addr;
            slot->memory_size = start_addr - prev.start_addr;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-rc));
                abort();
            }
        }

        /* register suffix slot */
        if (prev.start_addr + prev.memory_size > start_addr + size) {
            ram_addr_t skipped;

            slot = kvm_alloc_slot(s);
            slot->start_addr = start_addr + size;
            skipped = slot->start_addr - prev.start_addr;
            slot->memory_size = prev.memory_size - skipped;
            slot->phys_offset = prev.phys_offset + skipped;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-rc));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }

    slot = kvm_alloc_slot(s);
    slot->start_addr = start_addr;
    slot->memory_size = size;
    slot->phys_offset = phys_offset;
    slot->flags = 0;

    rc = kvm_set_user_memory_region(s, slot);
    if (rc) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-rc));
        abort();
    }
}
624

    
625
static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
626
                                  target_phys_addr_t start_addr,
627
                                  ram_addr_t size, ram_addr_t phys_offset)
628
{
629
    kvm_set_phys_mem(start_addr, size, phys_offset);
630
}
631

    
632
static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
633
                                        target_phys_addr_t start_addr,
634
                                        target_phys_addr_t end_addr)
635
{
636
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
637
}
638

    
639
/*
 * CPUPhysMemoryClient migration_log hook; forwards to
 * kvm_set_migration_log() and returns its result.
 */
static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    (void)client;
    return kvm_set_migration_log(enable);
}
644

    
645
static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
646
    .set_memory = kvm_client_set_memory,
647
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
648
    .migration_log = kvm_client_migration_log,
649
    .log_start = kvm_log_start,
650
    .log_stop = kvm_log_stop,
651
};
652

    
653
int kvm_init(void)
654
{
655
    static const char upgrade_note[] =
656
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
657
        "(see http://sourceforge.net/projects/kvm).\n";
658
    KVMState *s;
659
    const KVMCapabilityInfo *missing_cap;
660
    int ret;
661
    int i;
662

    
663
    s = qemu_mallocz(sizeof(KVMState));
664

    
665
#ifdef KVM_CAP_SET_GUEST_DEBUG
666
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
667
#endif
668
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
669
        s->slots[i].slot = i;
670
    }
671
    s->vmfd = -1;
672
    s->fd = qemu_open("/dev/kvm", O_RDWR);
673
    if (s->fd == -1) {
674
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
675
        ret = -errno;
676
        goto err;
677
    }
678

    
679
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
680
    if (ret < KVM_API_VERSION) {
681
        if (ret > 0) {
682
            ret = -EINVAL;
683
        }
684
        fprintf(stderr, "kvm version too old\n");
685
        goto err;
686
    }
687

    
688
    if (ret > KVM_API_VERSION) {
689
        ret = -EINVAL;
690
        fprintf(stderr, "kvm version not supported\n");
691
        goto err;
692
    }
693

    
694
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
695
    if (s->vmfd < 0) {
696
#ifdef TARGET_S390X
697
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
698
                        "your host kernel command line\n");
699
#endif
700
        goto err;
701
    }
702

    
703
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
704
    if (!missing_cap) {
705
        missing_cap =
706
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
707
    }
708
    if (missing_cap) {
709
        ret = -EINVAL;
710
        fprintf(stderr, "kvm does not support %s\n%s",
711
                missing_cap->name, upgrade_note);
712
        goto err;
713
    }
714

    
715
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
716

    
717
    s->broken_set_mem_region = 1;
718
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
719
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
720
    if (ret > 0) {
721
        s->broken_set_mem_region = 0;
722
    }
723
#endif
724

    
725
    s->vcpu_events = 0;
726
#ifdef KVM_CAP_VCPU_EVENTS
727
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
728
#endif
729

    
730
    s->robust_singlestep = 0;
731
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
732
    s->robust_singlestep =
733
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
734
#endif
735

    
736
    s->debugregs = 0;
737
#ifdef KVM_CAP_DEBUGREGS
738
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
739
#endif
740

    
741
    s->xsave = 0;
742
#ifdef KVM_CAP_XSAVE
743
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
744
#endif
745

    
746
    s->xcrs = 0;
747
#ifdef KVM_CAP_XCRS
748
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
749
#endif
750

    
751
    ret = kvm_arch_init(s);
752
    if (ret < 0) {
753
        goto err;
754
    }
755

    
756
    kvm_state = s;
757
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
758

    
759
    s->many_ioeventfds = kvm_check_many_ioeventfds();
760

    
761
    return 0;
762

    
763
err:
764
    if (s) {
765
        if (s->vmfd != -1) {
766
            close(s->vmfd);
767
        }
768
        if (s->fd != -1) {
769
            close(s->fd);
770
        }
771
    }
772
    qemu_free(s);
773

    
774
    return ret;
775
}
776

    
777
static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
778
                          uint32_t count)
779
{
780
    int i;
781
    uint8_t *ptr = data;
782

    
783
    for (i = 0; i < count; i++) {
784
        if (direction == KVM_EXIT_IO_IN) {
785
            switch (size) {
786
            case 1:
787
                stb_p(ptr, cpu_inb(port));
788
                break;
789
            case 2:
790
                stw_p(ptr, cpu_inw(port));
791
                break;
792
            case 4:
793
                stl_p(ptr, cpu_inl(port));
794
                break;
795
            }
796
        } else {
797
            switch (size) {
798
            case 1:
799
                cpu_outb(port, ldub_p(ptr));
800
                break;
801
            case 2:
802
                cpu_outw(port, lduw_p(ptr));
803
                break;
804
            case 4:
805
                cpu_outl(port, ldl_p(ptr));
806
                break;
807
            }
808
        }
809

    
810
        ptr += size;
811
    }
812
}
813

    
814
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
/*
 * Report a KVM_EXIT_INTERNAL_ERROR exit on stderr.
 *
 * Returns 0 only for an emulation failure on which the architecture code
 * chooses not to stop (the cpu state is dumped first); returns -1 in every
 * other case.
 */
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (!kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        fprintf(stderr, "\n");
    } else {
        int idx;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (idx = 0; idx < run->internal.ndata; ++idx) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
        }
    }

    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    if (run->internal.suberror != KVM_INTERNAL_ERROR_EMULATION) {
        return -1;
    }

    fprintf(stderr, "emulation failure\n");
    if (kvm_arch_stop_on_emulation_error(env)) {
        return -1;
    }

    cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
    return 0;
}
#endif
842

    
843
void kvm_flush_coalesced_mmio_buffer(void)
844
{
845
    KVMState *s = kvm_state;
846
    if (s->coalesced_mmio_ring) {
847
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
848
        while (ring->first != ring->last) {
849
            struct kvm_coalesced_mmio *ent;
850

    
851
            ent = &ring->coalesced_mmio[ring->first];
852

    
853
            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
854
            smp_wmb();
855
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
856
        }
857
    }
858
}
859

    
860
static void do_kvm_cpu_synchronize_state(void *_env)
861
{
862
    CPUState *env = _env;
863

    
864
    if (!env->kvm_vcpu_dirty) {
865
        kvm_arch_get_registers(env);
866
        env->kvm_vcpu_dirty = 1;
867
    }
868
}
869

    
870
/*
 * Make sure env holds the current register state; if it is not marked
 * dirty, the fetch is dispatched through run_on_cpu().
 */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (env->kvm_vcpu_dirty) {
        return;
    }

    run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}
876

    
877
/*
 * Write the register state to KVM at KVM_PUT_RESET_STATE level and clear
 * the dirty mark.
 */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}
882

    
883
/*
 * Write the register state to KVM at KVM_PUT_FULL_STATE level and clear
 * the dirty mark.
 */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}
888

    
889
/*
 * Run the vcpu until an exit needs attention outside this loop.
 *
 * Returns EXCP_HLT when kvm_arch_process_irqchip_events() reports non-zero,
 * EXCP_DEBUG when kvm_arch_debug() claims a debug exit, and EXCP_INTERRUPT
 * otherwise.  A negative handler result dumps the cpu state and stops the
 * VM with VMSTOP_PANIC.  A KVM_RUN failure other than -EINTR/-EAGAIN is
 * fatal: it is reported on stderr and the process aborts.
 */
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_irqchip_events(env)) {
        env->exit_request = 0;
        return EXCP_HLT;
    }

    cpu_single_env = env;

    do {
        /* push register changes made on the qemu side before entering */
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();

        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        kvm_flush_coalesced_mmio_buffer();

        if (ret == -EINTR || ret == -EAGAIN) {
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            /* always say why we are about to abort, not only in debug builds */
            fprintf(stderr, "error: kvm run failed %s\n", strerror(-ret));
            abort();
        }

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 1;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                ret = EXCP_DEBUG;
                goto out;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }
    ret = EXCP_INTERRUPT;

out:
    env->exit_request = 0;
    cpu_single_env = NULL;
    return ret;
}
1006

    
1007
/*
 * Issue an ioctl on the file descriptor stored in s->fd.
 *
 * A single pointer argument is read from the variadic list and passed
 * through to ioctl().  The ioctl() result is returned unchanged, except
 * that a result of -1 is replaced by -errno.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->fd, type, payload);
    return (rc == -1) ? -errno : rc;
}
1023

    
1024
/*
 * Issue an ioctl on the file descriptor stored in s->vmfd.
 *
 * A single pointer argument is read from the variadic list and passed
 * through to ioctl().  The ioctl() result is returned unchanged, except
 * that a result of -1 is replaced by -errno.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->vmfd, type, payload);
    if (rc != -1) {
        return rc;
    }
    return -errno;
}
1040

    
1041
/*
 * Issue an ioctl on the file descriptor stored in env->kvm_fd.
 *
 * A single pointer argument is read from the variadic list and passed
 * through to ioctl().  The ioctl() result is returned unchanged, except
 * that a result of -1 is replaced by -errno.
 */
int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    rc = ioctl(env->kvm_fd, type, payload);
    if (rc != -1) {
        return rc;
    }
    return -errno;
}
1057

    
1058
int kvm_has_sync_mmu(void)
1059
{
1060
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1061
}
1062

    
1063
int kvm_has_vcpu_events(void)
1064
{
1065
    return kvm_state->vcpu_events;
1066
}
1067

    
1068
int kvm_has_robust_singlestep(void)
1069
{
1070
    return kvm_state->robust_singlestep;
1071
}
1072

    
1073
int kvm_has_debugregs(void)
1074
{
1075
    return kvm_state->debugregs;
1076
}
1077

    
1078
int kvm_has_xsave(void)
1079
{
1080
    return kvm_state->xsave;
1081
}
1082

    
1083
int kvm_has_xcrs(void)
1084
{
1085
    return kvm_state->xcrs;
1086
}
1087

    
1088
int kvm_has_many_ioeventfds(void)
1089
{
1090
    if (!kvm_enabled()) {
1091
        return 0;
1092
    }
1093
    return kvm_state->many_ioeventfds;
1094
}
1095

    
1096
void kvm_setup_guest_memory(void *start, size_t size)
1097
{
1098
    if (!kvm_has_sync_mmu()) {
1099
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1100

    
1101
        if (ret) {
1102
            perror("qemu_madvise");
1103
            fprintf(stderr,
1104
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1105
            exit(1);
1106
        }
1107
    }
1108
}
1109

    
1110
#ifdef KVM_CAP_SET_GUEST_DEBUG
1111
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
1112
                                                 target_ulong pc)
1113
{
1114
    struct kvm_sw_breakpoint *bp;
1115

    
1116
    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
1117
        if (bp->pc == pc) {
1118
            return bp;
1119
        }
1120
    }
1121
    return NULL;
1122
}
1123

    
1124
/*
 * Return 1 when the software breakpoint list of env->kvm_state holds at
 * least one entry, 0 when it is empty.
 */
int kvm_sw_breakpoints_active(CPUState *env)
{
    if (QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints)) {
        return 0;
    }
    return 1;
}
1128

    
1129
struct kvm_set_guest_debug_data {
1130
    struct kvm_guest_debug dbg;
1131
    CPUState *env;
1132
    int err;
1133
};
1134

    
1135
static void kvm_invoke_set_guest_debug(void *data)
1136
{
1137
    struct kvm_set_guest_debug_data *dbg_data = data;
1138
    CPUState *env = dbg_data->env;
1139

    
1140
    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
1141
}
1142

    
1143
/*
 * Rebuild the guest debug control word for @env and push it to KVM.
 *
 * The control word starts from @reinject_trap, gets the enable and
 * single-step bits when env->singlestep_enabled is set, and is then
 * passed to kvm_arch_update_guest_debug() for architecture-specific
 * additions.  The ioctl itself is dispatched through run_on_cpu().
 *
 * Returns the value stored by kvm_invoke_set_guest_debug().
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data request;

    request.env = env;
    request.dbg.control = reinject_trap;
    if (env->singlestep_enabled) {
        request.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &request.dbg);

    run_on_cpu(env, kvm_invoke_set_guest_debug, &request);
    return request.err;
}
1158

    
1159
/*
 * Insert a breakpoint of the given @type at @addr.
 *
 * For GDB_BREAKPOINT_SW: if a software breakpoint already exists at @addr
 * its use count is bumped and 0 is returned immediately.  Otherwise a new
 * record is allocated, handed to kvm_arch_insert_sw_breakpoint() and, on
 * success, linked at the head of the kvm_sw_breakpoints list.
 * For every other type the request is forwarded to
 * kvm_arch_insert_hw_breakpoint().
 *
 * After a successful insertion the debug state of every CPU is refreshed
 * via kvm_update_guest_debug().
 *
 * Returns 0 on success, -ENOMEM if the record could not be allocated, or
 * the first non-zero error reported by one of the helpers.
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(*bp));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            /* Allocated with qemu_malloc(), so release with qemu_free(). */
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
1203

    
1204
/*
 * Remove a breakpoint of the given @type at @addr.
 *
 * For GDB_BREAKPOINT_SW: returns -ENOENT when no software breakpoint is
 * recorded at @addr.  When the record is still referenced more than once
 * only its use count is decremented and 0 is returned.  Otherwise the
 * breakpoint is removed through kvm_arch_remove_sw_breakpoint(), unlinked
 * from the kvm_sw_breakpoints list and freed.
 * For every other type the request is forwarded to
 * kvm_arch_remove_hw_breakpoint().
 *
 * After a successful removal the debug state of every CPU is refreshed
 * via kvm_update_guest_debug().
 *
 * Returns 0 on success or the first non-zero error from a helper.
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *sw_bp;
    CPUState *cpu;
    int rc;

    if (type != GDB_BREAKPOINT_SW) {
        rc = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (rc) {
            return rc;
        }
    } else {
        sw_bp = kvm_find_sw_breakpoint(current_env, addr);
        if (sw_bp == NULL) {
            return -ENOENT;
        }

        /* Still referenced by another user: just drop one reference. */
        if (sw_bp->use_count > 1) {
            sw_bp->use_count--;
            return 0;
        }

        rc = kvm_arch_remove_sw_breakpoint(current_env, sw_bp);
        if (rc) {
            return rc;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints,
                      sw_bp, entry);
        qemu_free(sw_bp);
    }

    cpu = first_cpu;
    while (cpu != NULL) {
        rc = kvm_update_guest_debug(cpu, 0);
        if (rc) {
            return rc;
        }
        cpu = cpu->next_cpu;
    }
    return 0;
}
1244

    
1245
/*
 * Remove every software and hardware breakpoint.
 *
 * Each software breakpoint is first removed through @current_env; if that
 * fails, the other CPUs are tried in turn until one succeeds.  The record
 * is then unlinked from the kvm_sw_breakpoints list and freed, so the
 * list is empty afterwards and no stale entry can be matched by a later
 * kvm_find_sw_breakpoint() call.  Hardware breakpoints are dropped via
 * kvm_arch_remove_all_hw_breakpoints().
 *
 * Finally the debug state of every CPU is refreshed; errors from
 * kvm_update_guest_debug() are ignored here, as the function has no way
 * to report them.
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
        /* The _SAFE iterator already holds the successor in 'next'. */
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}
1267

    
1268
#else /* !KVM_CAP_SET_GUEST_DEBUG */
1269

    
1270
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always -EINVAL. */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}
1274

    
1275
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always -EINVAL. */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
1280

    
1281
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always -EINVAL. */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
1286

    
1287
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: does nothing. */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
1290
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
1291

    
1292
/*
 * Set the signal mask of the vCPU @env via KVM_SET_SIGNAL_MASK.
 *
 * With @sigset == NULL the ioctl is issued with a NULL argument.
 * Otherwise a struct kvm_signal_mask with room for a full sigset_t is
 * allocated, filled from @sigset and passed to the ioctl.
 *
 * Returns the result of kvm_vcpu_ioctl().
 */
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    /*
     * NOTE(review): len is fixed at 8 rather than sizeof(*sigset);
     * presumably this is the sigset size the kernel expects - confirm
     * against the KVM API documentation.
     */
    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    /* Allocated with qemu_malloc(), so release with qemu_free(). */
    qemu_free(sigmask);

    return r;
}
1310

    
1311
/*
 * Assign or deassign an ioeventfd for a 4-byte MMIO write.
 *
 * @fd:     eventfd to associate
 * @addr:   address to match
 * @val:    value to match (KVM_IOEVENTFD_FLAG_DATAMATCH is always set)
 * @assign: true to assign, false to deassign
 *
 * Returns 0 on success, -ENOSYS when KVM is not enabled or KVM_IOEVENTFD
 * is not available at build time, or the negative error code returned by
 * kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    /* Designated initializer zeroes every member not named here. */
    struct kvm_ioeventfd iofd = {
        .datamatch = val,
        .addr = addr,
        .len = 4,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH,
        .fd = fd,
    };

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    /* kvm_vm_ioctl() already returns the negative error code. */
    if (ret < 0) {
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}
1342

    
1343
/*
 * Assign or deassign an ioeventfd for a 2-byte port I/O write.
 *
 * @fd:     eventfd to associate
 * @addr:   port address to match
 * @val:    value to match (KVM_IOEVENTFD_FLAG_DATAMATCH is always set)
 * @assign: true to assign, false to deassign
 *
 * Returns 0 on success, -ENOSYS when KVM is not enabled or KVM_IOEVENTFD
 * is not available at build time, or the negative result of
 * kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd request = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int rc;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        request.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    rc = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &request);
    return (rc < 0) ? rc : 0;
#else
    return -ENOSYS;
#endif
}
1369

    
1370
/*
 * Forward a SIGBUS notification for @env to kvm_arch_on_sigbus_vcpu()
 * and return its result.
 */
int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
{
    int rc = kvm_arch_on_sigbus_vcpu(env, code, addr);

    return rc;
}
1374

    
1375
/* Forward a SIGBUS notification to kvm_arch_on_sigbus() and return its result. */
int kvm_on_sigbus(int code, void *addr)
{
    int rc = kvm_arch_on_sigbus(code, addr);

    return rc;
}