/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
    int xsave, xcrs;
    int many_ioeventfds;
};

KVMState *kvm_state;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_LAST_INFO
};

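/* Return the first unused slot; the slot array is a fixed-size pool, so
 * running out of entries is fatal. */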
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        if (s->slots[i].memory_size == 0) {
            return &s->slots[i];
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}

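/* Translate a ram_addr_t into the guest physical address of the slot that
 * maps it; returns 1 on success, 0 if no slot covers the address. */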
int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
                                      target_phys_addr_t *phys_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (ram_addr >= mem->phys_offset &&
            ram_addr < mem->phys_offset + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
            return 1;
        }
    }

    return 0;
}

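/* Push one slot's layout to the kernel. Dirty logging is forced on while
 * migration logging is active. */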
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}

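/* Create the in-kernel vcpu, mmap its shared kvm_run area and hand the CPU
 * over to the architecture-specific init code. */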
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;
    env->kvm_vcpu_dirty = 1;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

static int kvm_log_start(CPUPhysMemoryClient *client,
                         target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_log_stop(CPUPhysMemoryClient *client,
                        target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, 0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

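/* Enable or disable dirty logging on all slots whose flags do not already
 * match the requested state. */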
static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!mem->memory_size) {
            continue;
        }
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty(), i.e. bits are only ever set here,
 * never cleared.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}

int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

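/* Probe a KVM capability. Returns the (never negative) ioctl result, with
 * errors mapped to 0, so callers can treat it as a boolean. */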
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD) && defined(CONFIG_IOTHREAD)
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

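/* Register, resize or delete the slot(s) backing a guest physical range.
 * Overlapping slots are unregistered first and re-registered as prefix and
 * suffix fragments where they extend beyond the new region. */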
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, not even
         * by unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }
    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size, ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
};

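/* Open /dev/kvm, create the VM, verify the API version and required
 * capabilities, and probe the optional ones cached in KVMState. */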
int kvm_init(void)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        s->slots[i].slot = i;
    }
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        ret = s->vmfd;
        goto err;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->xsave = 0;
#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

    s->xcrs = 0;
#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

    ret = kvm_arch_init(s);
    if (ret < 0) {
        goto err;
    }

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    return 0;

err:
    if (s) {
        if (s->vmfd != -1) {
            close(s->vmfd);
        }
        if (s->fd != -1) {
            close(s->fd);
        }
    }
    qemu_free(s);

    return ret;
}

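/* Complete a KVM_EXIT_IO by replaying each element of the data buffer
 * through QEMU's port I/O dispatch. */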
static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
                          uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    } else {
        fprintf(stderr, "\n");
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
#endif

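/* Drain the coalesced MMIO ring shared with the kernel, replaying each
 * entry as an ordinary physical memory write. */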
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
}

static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty) {
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
    }
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}

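/* The per-vcpu run loop: sync registers if dirty, enter KVM_RUN with the
 * iothread lock dropped, then dispatch on the exit reason until an exit
 * code is produced for the caller. */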
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(env)) {
        env->exit_request = 0;
        return EXCP_HLT;
    }

    cpu_single_env = env;

    do {
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();

        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        kvm_flush_coalesced_mmio_buffer();

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                ret = EXCP_INTERRUPT;
                break;
            }
            DPRINTF("kvm run failed %s\n", strerror(-run_ret));
            abort();
        }

        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret == 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }

    env->exit_request = 0;
    cpu_single_env = NULL;
    return ret;
}

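/* Thin ioctl wrappers for the /dev/kvm, VM and vcpu file descriptors; all
 * of them convert the -1/errno convention into a -errno return value. */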
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_has_sync_mmu(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);

        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr,
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
            exit(1);
        }
    }
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

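/* Set (or, with a NULL sigset, clear) the signal mask the kernel installs
 * around this vcpu's KVM_RUN. */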
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}

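/* Bind an eventfd to a 4-byte MMIO datamatch address so a matching guest
 * write is signalled in the kernel without a userspace exit; pass
 * assign=false to unbind. */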
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    struct kvm_ioeventfd iofd;

    iofd.datamatch = val;
    iofd.addr = addr;
    iofd.len = 4;
    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
    iofd.fd = fd;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
{
    return kvm_arch_on_sigbus_vcpu(env, code, addr);
}

int kvm_on_sigbus(int code, void *addr)
{
    return kvm_arch_on_sigbus(code, addr);
}