Statistics
| Branch: | Revision:

root / kvm-all.c @ fbc1c7e6

History | View | Annotate | Download (34.6 kB)

1
/*
2
 * QEMU KVM support
3
 *
4
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
6
 *
7
 * Authors:
8
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
10
 *
11
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12
 * See the COPYING file in the top-level directory.
13
 *
14
 */
15

    
16
#include <sys/types.h>
17
#include <sys/ioctl.h>
18
#include <sys/mman.h>
19
#include <stdarg.h>
20

    
21
#include <linux/kvm.h>
22

    
23
#include "qemu-common.h"
24
#include "qemu-barrier.h"
25
#include "sysemu.h"
26
#include "hw/hw.h"
27
#include "gdbstub.h"
28
#include "kvm.h"
29
#include "bswap.h"
30

    
31
/* This check must be after config-host.h is included */
32
#ifdef CONFIG_EVENTFD
33
#include <sys/eventfd.h>
34
#endif
35

    
36
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
37
#define PAGE_SIZE TARGET_PAGE_SIZE
38

    
39
//#define DEBUG_KVM
40

    
41
/*
 * Debug tracing helper: with DEBUG_KVM defined, DPRINTF() forwards its
 * arguments to fprintf(stderr, ...); otherwise it expands to an empty
 * statement.
 */
#if defined(DEBUG_KVM)
#define DPRINTF(fmt, ...)                                   \
    do {                                                    \
        fprintf(stderr, fmt, ## __VA_ARGS__);               \
    } while (0)
#else
#define DPRINTF(fmt, ...)                                   \
    do {                                                    \
    } while (0)
#endif
48

    
49
typedef struct KVMSlot
50
{
51
    target_phys_addr_t start_addr;
52
    ram_addr_t memory_size;
53
    ram_addr_t phys_offset;
54
    int slot;
55
    int flags;
56
} KVMSlot;
57

    
58
typedef struct kvm_dirty_log KVMDirtyLog;
59

    
60
struct KVMState
61
{
62
    KVMSlot slots[32];
63
    int fd;
64
    int vmfd;
65
    int coalesced_mmio;
66
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
67
    int broken_set_mem_region;
68
    int migration_log;
69
    int vcpu_events;
70
    int robust_singlestep;
71
    int debugregs;
72
#ifdef KVM_CAP_SET_GUEST_DEBUG
73
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
74
#endif
75
    int irqchip_in_kernel;
76
    int pit_in_kernel;
77
    int xsave, xcrs;
78
    int many_ioeventfds;
79
};
80

    
81
KVMState *kvm_state;
82

    
83
static const KVMCapabilityInfo kvm_required_capabilites[] = {
84
    KVM_CAP_INFO(USER_MEMORY),
85
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
86
    KVM_CAP_LAST_INFO
87
};
88

    
89
/*
 * Return the first slot of @s whose memory_size is 0.
 * Never returns NULL: when every slot is in use a message is printed
 * and the process is aborted.
 */
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    KVMSlot *slot = s->slots;
    KVMSlot *const end = s->slots + ARRAY_SIZE(s->slots);

    for (; slot != end; slot++) {
        if (!slot->memory_size) {
            return slot;
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}
102

    
103
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
104
                                         target_phys_addr_t start_addr,
105
                                         target_phys_addr_t end_addr)
106
{
107
    int i;
108

    
109
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
110
        KVMSlot *mem = &s->slots[i];
111

    
112
        if (start_addr == mem->start_addr &&
113
            end_addr == mem->start_addr + mem->memory_size) {
114
            return mem;
115
        }
116
    }
117

    
118
    return NULL;
119
}
120

    
121
/*
122
 * Find overlapping slot with lowest start address
123
 */
124
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
125
                                            target_phys_addr_t start_addr,
126
                                            target_phys_addr_t end_addr)
127
{
128
    KVMSlot *found = NULL;
129
    int i;
130

    
131
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
132
        KVMSlot *mem = &s->slots[i];
133

    
134
        if (mem->memory_size == 0 ||
135
            (found && found->start_addr < mem->start_addr)) {
136
            continue;
137
        }
138

    
139
        if (end_addr > mem->start_addr &&
140
            start_addr < mem->start_addr + mem->memory_size) {
141
            found = mem;
142
        }
143
    }
144

    
145
    return found;
146
}
147

    
148
int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
149
                                      target_phys_addr_t *phys_addr)
150
{
151
    int i;
152

    
153
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
154
        KVMSlot *mem = &s->slots[i];
155

    
156
        if (ram_addr >= mem->phys_offset &&
157
            ram_addr < mem->phys_offset + mem->memory_size) {
158
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
159
            return 1;
160
        }
161
    }
162

    
163
    return 0;
164
}
165

    
166
/*
 * Push the current description of @slot to the kernel with
 * KVM_SET_USER_MEMORY_REGION.
 *
 * While s->migration_log is set, KVM_MEM_LOG_DIRTY_PAGES is added to the
 * flags sent to the kernel without being stored in the slot itself.
 * Returns the result of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;

    region.slot = slot->slot;
    region.flags = slot->flags;
    if (s->migration_log) {
        region.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    region.guest_phys_addr = slot->start_addr;
    region.memory_size = slot->memory_size;
    region.userspace_addr =
        (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}
180

    
181
static void kvm_reset_vcpu(void *opaque)
182
{
183
    CPUState *env = opaque;
184

    
185
    kvm_arch_reset_vcpu(env);
186
}
187

    
188
int kvm_irqchip_in_kernel(void)
189
{
190
    return kvm_state->irqchip_in_kernel;
191
}
192

    
193
int kvm_pit_in_kernel(void)
194
{
195
    return kvm_state->pit_in_kernel;
196
}
197

    
198
/*
 * Create the kernel vcpu for @env and map its shared kvm_run area.
 *
 * Steps: KVM_CREATE_VCPU, query KVM_GET_VCPU_MMAP_SIZE, mmap() the vcpu
 * descriptor, locate the coalesced MMIO ring (first vcpu only), then run
 * the architecture specific initialisation. On success the vcpu reset
 * handler is registered and the vcpu is reset once.
 *
 * Returns 0 on success or the negative value of the step that failed.
 */
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        return ret;
    }
    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        return ret;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        /* Latch errno before any further library call can change it. */
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        return ret;
    }

    /* The ring lives coalesced_mmio pages into the vcpu mapping. */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(env);
    if (ret != 0) {
        return ret;
    }

    qemu_register_reset(kvm_reset_vcpu, env);
    kvm_arch_reset_vcpu(env);
    return ret;
}
243

    
244
/*
245
 * dirty pages logging control
246
 */
247
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
248
                                      ram_addr_t size, int flags, int mask)
249
{
250
    KVMState *s = kvm_state;
251
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
252
    int old_flags;
253

    
254
    if (mem == NULL)  {
255
            fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
256
                    TARGET_FMT_plx "\n", __func__, phys_addr,
257
                    (target_phys_addr_t)(phys_addr + size - 1));
258
            return -EINVAL;
259
    }
260

    
261
    old_flags = mem->flags;
262

    
263
    flags = (mem->flags & ~mask) | flags;
264
    mem->flags = flags;
265

    
266
    /* If nothing changed effectively, no need to issue ioctl */
267
    if (s->migration_log) {
268
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
269
    }
270
    if (flags == old_flags) {
271
            return 0;
272
    }
273

    
274
    return kvm_set_user_memory_region(s, mem);
275
}
276

    
277
static int kvm_log_start(CPUPhysMemoryClient *client,
278
                         target_phys_addr_t phys_addr, ram_addr_t size)
279
{
280
    return kvm_dirty_pages_log_change(phys_addr, size, KVM_MEM_LOG_DIRTY_PAGES,
281
                                      KVM_MEM_LOG_DIRTY_PAGES);
282
}
283

    
284
static int kvm_log_stop(CPUPhysMemoryClient *client,
285
                        target_phys_addr_t phys_addr, ram_addr_t size)
286
{
287
    return kvm_dirty_pages_log_change(phys_addr, size, 0,
288
                                      KVM_MEM_LOG_DIRTY_PAGES);
289
}
290

    
291
static int kvm_set_migration_log(int enable)
292
{
293
    KVMState *s = kvm_state;
294
    KVMSlot *mem;
295
    int i, err;
296

    
297
    s->migration_log = enable;
298

    
299
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
300
        mem = &s->slots[i];
301

    
302
        if (!mem->memory_size) {
303
            continue;
304
        }
305
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
306
            continue;
307
        }
308
        err = kvm_set_user_memory_region(s, mem);
309
        if (err) {
310
            return err;
311
        }
312
    }
313
    return 0;
314
}
315

    
316
/* get kvm's dirty pages bitmap and update qemu's */
317
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
318
                                         unsigned long *bitmap,
319
                                         unsigned long offset,
320
                                         unsigned long mem_size)
321
{
322
    unsigned int i, j;
323
    unsigned long page_number, addr, addr1, c;
324
    ram_addr_t ram_addr;
325
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
326
        HOST_LONG_BITS;
327

    
328
    /*
329
     * bitmap-traveling is faster than memory-traveling (for addr...)
330
     * especially when most of the memory is not dirty.
331
     */
332
    for (i = 0; i < len; i++) {
333
        if (bitmap[i] != 0) {
334
            c = leul_to_cpu(bitmap[i]);
335
            do {
336
                j = ffsl(c) - 1;
337
                c &= ~(1ul << j);
338
                page_number = i * HOST_LONG_BITS + j;
339
                addr1 = page_number * TARGET_PAGE_SIZE;
340
                addr = offset + addr1;
341
                ram_addr = cpu_get_physical_page_desc(addr);
342
                cpu_physical_memory_set_dirty(ram_addr);
343
            } while (c != 0);
344
        }
345
    }
346
    return 0;
347
}
348

    
349
/* Round x up to the next multiple of y; y must be a power of two. */
#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
350

    
351
/**
352
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
353
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty().
354
 * This means all bits are set to dirty.
355
 *
356
 * @start_add: start of logged region.
357
 * @end_addr: end of logged region.
358
 */
359
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
360
                                          target_phys_addr_t end_addr)
361
{
362
    KVMState *s = kvm_state;
363
    unsigned long size, allocated_size = 0;
364
    KVMDirtyLog d;
365
    KVMSlot *mem;
366
    int ret = 0;
367

    
368
    d.dirty_bitmap = NULL;
369
    while (start_addr < end_addr) {
370
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
371
        if (mem == NULL) {
372
            break;
373
        }
374

    
375
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
376
        if (!d.dirty_bitmap) {
377
            d.dirty_bitmap = qemu_malloc(size);
378
        } else if (size > allocated_size) {
379
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
380
        }
381
        allocated_size = size;
382
        memset(d.dirty_bitmap, 0, allocated_size);
383

    
384
        d.slot = mem->slot;
385

    
386
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
387
            DPRINTF("ioctl failed %d\n", errno);
388
            ret = -1;
389
            break;
390
        }
391

    
392
        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
393
                                      mem->start_addr, mem->memory_size);
394
        start_addr = mem->start_addr + mem->memory_size;
395
    }
396
    qemu_free(d.dirty_bitmap);
397

    
398
    return ret;
399
}
400

    
401
/*
 * Register [start, start + size) as a coalesced MMIO zone.
 * Returns -ENOSYS when coalesced MMIO is not available, otherwise the
 * result of the KVM_REGISTER_COALESCED_MMIO ioctl.
 */
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;

    return kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
417

    
418
/*
 * Remove [start, start + size) from the coalesced MMIO zones.
 * Returns -ENOSYS when coalesced MMIO is not available, otherwise the
 * result of the KVM_UNREGISTER_COALESCED_MMIO ioctl.
 */
int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;

    return kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
434

    
435
/*
 * Query KVM_CHECK_EXTENSION for @extension.
 * Returns the ioctl result, with any negative result mapped to 0.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return ret < 0 ? 0 : ret;
}
446

    
447
/*
 * Probe whether more than six ioeventfds can be registered.
 * Returns 1 when all seven probe descriptors could be registered, 0
 * otherwise (and always 0 without CONFIG_EVENTFD and CONFIG_IOTHREAD).
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD) && defined(CONFIG_IOTHREAD)
    int fds[7];
    int nr, rc = 0;

    /* Register one probe descriptor per port until something fails. */
    for (nr = 0; nr < ARRAY_SIZE(fds); nr++) {
        fds[nr] = eventfd(0, EFD_CLOEXEC);
        if (fds[nr] < 0) {
            break;
        }
        rc = kvm_set_ioeventfd_pio_word(fds[nr], 0, nr, true);
        if (rc < 0) {
            close(fds[nr]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    rc = nr == ARRAY_SIZE(fds);

    /* Undo the registrations that succeeded, newest first. */
    while (nr-- > 0) {
        kvm_set_ioeventfd_pio_word(fds[nr], 0, nr, false);
        close(fds[nr]);
    }
    return rc;
#else
    return 0;
#endif
}
483

    
484
static const KVMCapabilityInfo *
485
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
486
{
487
    while (list->name) {
488
        if (!kvm_check_extension(s, list->value)) {
489
            return list;
490
        }
491
        list++;
492
    }
493
    return NULL;
494
}
495

    
496
/*
 * Make the slot table (and the kernel) reflect a new mapping of
 * [start_addr, start_addr + size) to @phys_offset.
 *
 * Every existing slot overlapping the range is unregistered; the parts
 * of it lying before or after the new range are re-registered as prefix
 * and suffix slots. Finally the new range itself is registered, unless
 * the sub-page bits of @phys_offset are IO_MEM_UNASSIGNED or above.
 * Any failing ioctl is fatal: a message is printed and abort() is called.
 */
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    /* The bits below the page mask carry the memory type. */
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *slot, prev;
    int rc;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while ((slot = kvm_lookup_overlapping_slot(s, start_addr,
                                               start_addr + size)) != NULL) {
        if (flags < IO_MEM_UNASSIGNED && start_addr >= slot->start_addr &&
            (start_addr + size <= slot->start_addr + slot->memory_size) &&
            (phys_offset - start_addr ==
             slot->phys_offset - slot->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        /* Remember the old geometry before the entry is recycled. */
        prev = *slot;

        /* unregister the overlapping slot */
        slot->memory_size = 0;
        rc = kvm_set_user_memory_region(s, slot);
        if (rc) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-rc));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
         * unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            prev.start_addr == start_addr && prev.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            slot = kvm_alloc_slot(s);
            slot->memory_size = prev.memory_size;
            slot->start_addr = prev.start_addr;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-rc));
                abort();
            }

            /* The old fragment now stands for the head of the new range. */
            start_addr += prev.memory_size;
            phys_offset += prev.memory_size;
            size -= prev.memory_size;
            continue;
        }

        /* register prefix slot */
        if (prev.start_addr < start_addr) {
            slot = kvm_alloc_slot(s);
            slot->memory_size = start_addr - prev.start_addr;
            slot->start_addr = prev.start_addr;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-rc));
                abort();
            }
        }

        /* register suffix slot */
        if (prev.start_addr + prev.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            slot = kvm_alloc_slot(s);
            slot->start_addr = start_addr + size;
            size_delta = slot->start_addr - prev.start_addr;
            slot->memory_size = prev.memory_size - size_delta;
            slot->phys_offset = prev.phys_offset + size_delta;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-rc));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }

    slot = kvm_alloc_slot(s);
    slot->memory_size = size;
    slot->start_addr = start_addr;
    slot->phys_offset = phys_offset;
    slot->flags = 0;

    rc = kvm_set_user_memory_region(s, slot);
    if (rc) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-rc));
        abort();
    }
}
624

    
625
static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
626
                                  target_phys_addr_t start_addr,
627
                                  ram_addr_t size, ram_addr_t phys_offset)
628
{
629
    kvm_set_phys_mem(start_addr, size, phys_offset);
630
}
631

    
632
static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
633
                                        target_phys_addr_t start_addr,
634
                                        target_phys_addr_t end_addr)
635
{
636
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
637
}
638

    
639
/*
 * CPUPhysMemoryClient migration_log hook; forwards to
 * kvm_set_migration_log(). @client is not used.
 */
static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    (void)client;
    return kvm_set_migration_log(enable);
}
644

    
645
static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
646
    .set_memory = kvm_client_set_memory,
647
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
648
    .migration_log = kvm_client_migration_log,
649
    .log_start = kvm_log_start,
650
    .log_stop = kvm_log_stop,
651
};
652

    
653
int kvm_init(void)
654
{
655
    static const char upgrade_note[] =
656
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
657
        "(see http://sourceforge.net/projects/kvm).\n";
658
    KVMState *s;
659
    const KVMCapabilityInfo *missing_cap;
660
    int ret;
661
    int i;
662

    
663
    s = qemu_mallocz(sizeof(KVMState));
664

    
665
#ifdef KVM_CAP_SET_GUEST_DEBUG
666
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
667
#endif
668
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
669
        s->slots[i].slot = i;
670
    }
671
    s->vmfd = -1;
672
    s->fd = qemu_open("/dev/kvm", O_RDWR);
673
    if (s->fd == -1) {
674
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
675
        ret = -errno;
676
        goto err;
677
    }
678

    
679
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
680
    if (ret < KVM_API_VERSION) {
681
        if (ret > 0) {
682
            ret = -EINVAL;
683
        }
684
        fprintf(stderr, "kvm version too old\n");
685
        goto err;
686
    }
687

    
688
    if (ret > KVM_API_VERSION) {
689
        ret = -EINVAL;
690
        fprintf(stderr, "kvm version not supported\n");
691
        goto err;
692
    }
693

    
694
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
695
    if (s->vmfd < 0) {
696
#ifdef TARGET_S390X
697
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
698
                        "your host kernel command line\n");
699
#endif
700
        goto err;
701
    }
702

    
703
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
704
    if (!missing_cap) {
705
        missing_cap =
706
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
707
    }
708
    if (missing_cap) {
709
        ret = -EINVAL;
710
        fprintf(stderr, "kvm does not support %s\n%s",
711
                missing_cap->name, upgrade_note);
712
        goto err;
713
    }
714

    
715
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
716

    
717
    s->broken_set_mem_region = 1;
718
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
719
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
720
    if (ret > 0) {
721
        s->broken_set_mem_region = 0;
722
    }
723
#endif
724

    
725
    s->vcpu_events = 0;
726
#ifdef KVM_CAP_VCPU_EVENTS
727
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
728
#endif
729

    
730
    s->robust_singlestep = 0;
731
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
732
    s->robust_singlestep =
733
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
734
#endif
735

    
736
    s->debugregs = 0;
737
#ifdef KVM_CAP_DEBUGREGS
738
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
739
#endif
740

    
741
    s->xsave = 0;
742
#ifdef KVM_CAP_XSAVE
743
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
744
#endif
745

    
746
    s->xcrs = 0;
747
#ifdef KVM_CAP_XCRS
748
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
749
#endif
750

    
751
    ret = kvm_arch_init(s);
752
    if (ret < 0) {
753
        goto err;
754
    }
755

    
756
    kvm_state = s;
757
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
758

    
759
    s->many_ioeventfds = kvm_check_many_ioeventfds();
760

    
761
    return 0;
762

    
763
err:
764
    if (s) {
765
        if (s->vmfd != -1) {
766
            close(s->vmfd);
767
        }
768
        if (s->fd != -1) {
769
            close(s->fd);
770
        }
771
    }
772
    qemu_free(s);
773

    
774
    return ret;
775
}
776

    
777
static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
778
                          uint32_t count)
779
{
780
    int i;
781
    uint8_t *ptr = data;
782

    
783
    for (i = 0; i < count; i++) {
784
        if (direction == KVM_EXIT_IO_IN) {
785
            switch (size) {
786
            case 1:
787
                stb_p(ptr, cpu_inb(port));
788
                break;
789
            case 2:
790
                stw_p(ptr, cpu_inw(port));
791
                break;
792
            case 4:
793
                stl_p(ptr, cpu_inl(port));
794
                break;
795
            }
796
        } else {
797
            switch (size) {
798
            case 1:
799
                cpu_outb(port, ldub_p(ptr));
800
                break;
801
            case 2:
802
                cpu_outw(port, lduw_p(ptr));
803
                break;
804
            case 4:
805
                cpu_outl(port, ldl_p(ptr));
806
                break;
807
            }
808
        }
809

    
810
        ptr += size;
811
    }
812
}
813

    
814
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
/*
 * Report a KVM_EXIT_INTERNAL_ERROR exit on stderr.
 *
 * Returns 0 when the suberror is an emulation failure and
 * kvm_arch_stop_on_emulation_error() returns zero; returns -1 otherwise.
 */
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (!kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        fprintf(stderr, "\n");
    } else {
        int idx;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (idx = 0; idx < run->internal.ndata; ++idx) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
        }
    }

    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            return 0;
        }
    }

    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
#endif
842

    
843
void kvm_flush_coalesced_mmio_buffer(void)
844
{
845
    KVMState *s = kvm_state;
846
    if (s->coalesced_mmio_ring) {
847
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
848
        while (ring->first != ring->last) {
849
            struct kvm_coalesced_mmio *ent;
850

    
851
            ent = &ring->coalesced_mmio[ring->first];
852

    
853
            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
854
            smp_wmb();
855
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
856
        }
857
    }
858
}
859

    
860
static void do_kvm_cpu_synchronize_state(void *_env)
861
{
862
    CPUState *env = _env;
863

    
864
    if (!env->kvm_vcpu_dirty) {
865
        kvm_arch_get_registers(env);
866
        env->kvm_vcpu_dirty = 1;
867
    }
868
}
869

    
870
/*
 * Make sure @env holds the register state from the kernel; the fetch is
 * dispatched through run_on_cpu() and skipped when the local copy is
 * already flagged dirty.
 */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (env->kvm_vcpu_dirty) {
        return;
    }

    run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}
876

    
877
/*
 * Write the registers of @env to the kernel at KVM_PUT_RESET_STATE level
 * and clear the dirty flag.
 */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    const int level = KVM_PUT_RESET_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
882

    
883
/*
 * Write the registers of @env to the kernel at KVM_PUT_FULL_STATE level
 * and clear the dirty flag.
 */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    const int level = KVM_PUT_FULL_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
888

    
889
/*
 * Run the vcpu of @env until an exit has to be handled outside of this
 * loop.
 *
 * Each iteration writes back dirty registers, enters the guest with
 * KVM_RUN (the iothread mutex is dropped around the ioctl), flushes the
 * coalesced MMIO ring and dispatches on the exit reason. Handlers that
 * leave ret > 0 re-enter the guest; ret == 0 leaves the loop; ret < 0
 * dumps the CPU state and stops the VM.
 *
 * Returns EXCP_HLT when kvm_arch_process_irqchip_events() returns
 * non-zero, EXCP_DEBUG when kvm_arch_debug() claims a debug exit, and
 * EXCP_INTERRUPT in all other cases. A KVM_RUN failure other than
 * -EINTR/-EAGAIN is reported on stderr and aborts the process.
 */
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_irqchip_events(env)) {
        env->exit_request = 0;
        return EXCP_HLT;
    }

    cpu_single_env = env;

    do {
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();

        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        kvm_flush_coalesced_mmio_buffer();

        if (ret == -EINTR || ret == -EAGAIN) {
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            /* Always say why we are about to abort, not only in
             * DEBUG_KVM builds. */
            fprintf(stderr, "error: kvm run failed %s\n", strerror(-ret));
            abort();
        }

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 1;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                ret = EXCP_DEBUG;
                goto out;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }
    ret = EXCP_INTERRUPT;

#ifdef KVM_CAP_SET_GUEST_DEBUG
out:
#endif
    env->exit_request = 0;
    cpu_single_env = NULL;
    return ret;
}
1008

    
1009
/*
 * Issue an ioctl on the descriptor stored in s->fd.
 *
 * One pointer-sized argument is fetched from the variadic list and handed
 * to ioctl() unchanged.
 *
 * Returns the ioctl() result, or -errno when ioctl() returned -1.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->fd, type, param);
    return (rc == -1) ? -errno : rc;
}
1025

    
1026
/*
 * Issue an ioctl on the descriptor stored in s->vmfd.
 *
 * One pointer-sized argument is fetched from the variadic list and handed
 * to ioctl() unchanged.
 *
 * Returns the ioctl() result, or -errno when ioctl() returned -1.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->vmfd, type, param);
    return (rc == -1) ? -errno : rc;
}
1042

    
1043
/*
 * Issue an ioctl on the descriptor stored in env->kvm_fd.
 *
 * One pointer-sized argument is fetched from the variadic list and handed
 * to ioctl() unchanged.
 *
 * Returns the ioctl() result, or -errno when ioctl() returned -1.
 */
int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(env->kvm_fd, type, param);
    return (rc == -1) ? -errno : rc;
}
1059

    
1060
int kvm_has_sync_mmu(void)
1061
{
1062
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1063
}
1064

    
1065
int kvm_has_vcpu_events(void)
1066
{
1067
    return kvm_state->vcpu_events;
1068
}
1069

    
1070
int kvm_has_robust_singlestep(void)
1071
{
1072
    return kvm_state->robust_singlestep;
1073
}
1074

    
1075
int kvm_has_debugregs(void)
1076
{
1077
    return kvm_state->debugregs;
1078
}
1079

    
1080
int kvm_has_xsave(void)
1081
{
1082
    return kvm_state->xsave;
1083
}
1084

    
1085
int kvm_has_xcrs(void)
1086
{
1087
    return kvm_state->xcrs;
1088
}
1089

    
1090
int kvm_has_many_ioeventfds(void)
1091
{
1092
    if (!kvm_enabled()) {
1093
        return 0;
1094
    }
1095
    return kvm_state->many_ioeventfds;
1096
}
1097

    
1098
void kvm_setup_guest_memory(void *start, size_t size)
1099
{
1100
    if (!kvm_has_sync_mmu()) {
1101
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1102

    
1103
        if (ret) {
1104
            perror("qemu_madvise");
1105
            fprintf(stderr,
1106
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1107
            exit(1);
1108
        }
1109
    }
1110
}
1111

    
1112
#ifdef KVM_CAP_SET_GUEST_DEBUG
1113
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
1114
                                                 target_ulong pc)
1115
{
1116
    struct kvm_sw_breakpoint *bp;
1117

    
1118
    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
1119
        if (bp->pc == pc) {
1120
            return bp;
1121
        }
1122
    }
1123
    return NULL;
1124
}
1125

    
1126
/*
 * Return 1 when the kvm_sw_breakpoints list of env->kvm_state holds at
 * least one entry, 0 when it is empty.
 */
int kvm_sw_breakpoints_active(CPUState *env)
{
    KVMState *s = env->kvm_state;

    return QTAILQ_EMPTY(&s->kvm_sw_breakpoints) ? 0 : 1;
}
1130

    
1131
struct kvm_set_guest_debug_data {
1132
    struct kvm_guest_debug dbg;
1133
    CPUState *env;
1134
    int err;
1135
};
1136

    
1137
static void kvm_invoke_set_guest_debug(void *data)
1138
{
1139
    struct kvm_set_guest_debug_data *dbg_data = data;
1140
    CPUState *env = dbg_data->env;
1141

    
1142
    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
1143
}
1144

    
1145
/*
 * Push the current debug configuration of a vCPU to KVM.
 *
 * reinject_trap seeds the control word; single-stepping flags are OR-ed in
 * when env->singlestep_enabled is set, and kvm_arch_update_guest_debug()
 * gets a chance to adjust the request. The ioctl itself is executed through
 * run_on_cpu().
 *
 * Returns the result of the KVM_SET_GUEST_DEBUG ioctl.
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data req;

    req.env = env;
    req.dbg.control = reinject_trap;
    if (env->singlestep_enabled) {
        req.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &req.dbg);

    run_on_cpu(env, kvm_invoke_set_guest_debug, &req);
    return req.err;
}
1160

    
1161
/*
 * Insert a breakpoint of the given type at addr.
 *
 * For GDB_BREAKPOINT_SW an existing entry at the same address only has its
 * use count bumped; otherwise a new record is allocated, installed through
 * kvm_arch_insert_sw_breakpoint() and linked at the head of the
 * kvm_sw_breakpoints list. Any other type is forwarded to
 * kvm_arch_insert_hw_breakpoint().
 *
 * After a new breakpoint has been installed, the debug state of every CPU
 * is refreshed with kvm_update_guest_debug().
 *
 * Returns 0 on success, -ENOMEM if the record could not be allocated, or
 * the first non-zero error reported by the arch hooks or by
 * kvm_update_guest_debug().
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(*bp));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            /* Release with the allocator that matches qemu_malloc(). */
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
1205

    
1206
/*
 * Remove a breakpoint of the given type at addr.
 *
 * Software breakpoints are reference counted: while more than one user
 * remains only the count is decremented. The last user triggers
 * kvm_arch_remove_sw_breakpoint(), after which the record is unlinked from
 * the kvm_sw_breakpoints list and freed. Any other type is forwarded to
 * kvm_arch_remove_hw_breakpoint().
 *
 * After an actual removal, the debug state of every CPU is refreshed.
 *
 * Returns 0 on success, -ENOENT when no software breakpoint exists at addr,
 * or the first non-zero error from the arch hooks or from
 * kvm_update_guest_debug().
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *entry_bp;
    CPUState *cpu;
    int rc;

    if (type != GDB_BREAKPOINT_SW) {
        rc = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (rc) {
            return rc;
        }
    } else {
        entry_bp = kvm_find_sw_breakpoint(current_env, addr);
        if (entry_bp == NULL) {
            return -ENOENT;
        }

        /* Other users still hold this breakpoint: just drop one reference. */
        if (entry_bp->use_count > 1) {
            entry_bp->use_count--;
            return 0;
        }

        rc = kvm_arch_remove_sw_breakpoint(current_env, entry_bp);
        if (rc) {
            return rc;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints,
                      entry_bp, entry);
        qemu_free(entry_bp);
    }

    cpu = first_cpu;
    while (cpu != NULL) {
        rc = kvm_update_guest_debug(cpu, 0);
        if (rc) {
            return rc;
        }
        cpu = cpu->next_cpu;
    }
    return 0;
}
1246

    
1247
/*
 * Remove every software and hardware breakpoint.
 *
 * Each software breakpoint is first removed via current_env; if that fails,
 * the remaining CPUs are tried until one succeeds. The record is then
 * unlinked from the kvm_sw_breakpoints list and freed, so that the list is
 * empty afterwards and kvm_sw_breakpoints_active() no longer reports stale
 * entries. Hardware breakpoints are dropped through
 * kvm_arch_remove_all_hw_breakpoints(), and finally the debug state of all
 * CPUs is refreshed (errors from that refresh are ignored, as before).
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    /* The _SAFE iterator is required because entries are unlinked below. */
    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}
1269

    
1270
#else /* !KVM_CAP_SET_GUEST_DEBUG */
1271

    
1272
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: guest debugging is
 * unavailable, so the request is always rejected with -EINVAL.
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    const int unsupported = -EINVAL;

    return unsupported;
}
1276

    
1277
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: breakpoints cannot
 * be inserted, so the request is always rejected with -EINVAL.
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    const int unsupported = -EINVAL;

    return unsupported;
}
1282

    
1283
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: there are no
 * breakpoints to remove, so the request is always rejected with -EINVAL.
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    const int unsupported = -EINVAL;

    return unsupported;
}
1288

    
1289
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: no breakpoints can
 * exist, so there is nothing to do.
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    return;
}
1292
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
1293

    
1294
/*
 * Install (or clear) the signal mask of a vCPU via KVM_SET_SIGNAL_MASK.
 *
 * env:    vCPU the ioctl is issued on.
 * sigset: mask to install; when NULL the ioctl is issued with a NULL
 *         argument instead.
 *
 * Returns the result of kvm_vcpu_ioctl().
 */
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    /* Header plus room for a full copy of the caller's sigset_t. */
    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    /* NOTE(review): 8 is presumably the kernel-side sigset size - confirm. */
    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    /* Release with the allocator that matches qemu_malloc(). */
    qemu_free(sigmask);

    return r;
}
1312

    
1313
/*
 * Assign or de-assign an ioeventfd that matches 4-byte writes of val to
 * address addr.
 *
 * fd:     eventfd descriptor placed in the request.
 * addr:   address to match.
 * val:    value to match (KVM_IOEVENTFD_FLAG_DATAMATCH is always set).
 * assign: true to register, false to add KVM_IOEVENTFD_FLAG_DEASSIGN.
 *
 * Returns 0 on success, -ENOSYS when KVM_IOEVENTFD is not available at
 * build time or kvm_enabled() is false, or the negative error code returned
 * by kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    /*
     * The designated initializer zeroes every member that is not named, so
     * no uninitialised stack bytes are handed to the kernel.
     */
    struct kvm_ioeventfd iofd = {
        .datamatch = val,
        .addr = addr,
        .len = 4,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH,
        .fd = fd,
    };

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        /* kvm_vm_ioctl() already returns -errno; do not re-read errno. */
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}
1344

    
1345
/*
 * Assign or de-assign an ioeventfd that matches 2-byte port I/O writes of
 * val to port addr.
 *
 * fd:     eventfd descriptor placed in the request.
 * addr:   port to match.
 * val:    value to match (DATAMATCH and PIO flags are always set).
 * assign: true to register, false to add KVM_IOEVENTFD_FLAG_DEASSIGN.
 *
 * Returns 0 on success, -ENOSYS when KVM_IOEVENTFD is not available at
 * build time or kvm_enabled() is false, or the negative value returned by
 * kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd request = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int rc;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        request.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    rc = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &request);
    return (rc < 0) ? rc : 0;
#else
    return -ENOSYS;
#endif
}
1371

    
1372
/*
 * Forward a SIGBUS notification for a specific vCPU to
 * kvm_arch_on_sigbus_vcpu() and return its result unchanged.
 */
int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
{
    int handled = kvm_arch_on_sigbus_vcpu(env, code, addr);

    return handled;
}
1376

    
1377
/*
 * Forward a SIGBUS notification to kvm_arch_on_sigbus() and return its
 * result unchanged.
 */
int kvm_on_sigbus(int code, void *addr)
{
    int handled = kvm_arch_on_sigbus(code, addr);

    return handled;
}