Statistics
| Branch: | Revision:

root / kvm-all.c @ a1b87fe0

History | View | Annotate | Download (34.4 kB)

1
/*
2
 * QEMU KVM support
3
 *
4
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
6
 *
7
 * Authors:
8
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
10
 *
11
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12
 * See the COPYING file in the top-level directory.
13
 *
14
 */
15

    
16
#include <sys/types.h>
17
#include <sys/ioctl.h>
18
#include <sys/mman.h>
19
#include <stdarg.h>
20

    
21
#include <linux/kvm.h>
22

    
23
#include "qemu-common.h"
24
#include "qemu-barrier.h"
25
#include "sysemu.h"
26
#include "hw/hw.h"
27
#include "gdbstub.h"
28
#include "kvm.h"
29
#include "bswap.h"
30

    
31
/* This check must be after config-host.h is included */
32
#ifdef CONFIG_EVENTFD
33
#include <sys/eventfd.h>
34
#endif
35

    
36
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
37
#define PAGE_SIZE TARGET_PAGE_SIZE
38

    
39
//#define DEBUG_KVM
40

    
41
#ifdef DEBUG_KVM
42
#define DPRINTF(fmt, ...) \
43
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
44
#else
45
#define DPRINTF(fmt, ...) \
46
    do { } while (0)
47
#endif
48

    
49
typedef struct KVMSlot
50
{
51
    target_phys_addr_t start_addr;
52
    ram_addr_t memory_size;
53
    ram_addr_t phys_offset;
54
    int slot;
55
    int flags;
56
} KVMSlot;
57

    
58
typedef struct kvm_dirty_log KVMDirtyLog;
59

    
60
struct KVMState
61
{
62
    KVMSlot slots[32];
63
    int fd;
64
    int vmfd;
65
    int coalesced_mmio;
66
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
67
    int broken_set_mem_region;
68
    int migration_log;
69
    int vcpu_events;
70
    int robust_singlestep;
71
    int debugregs;
72
#ifdef KVM_CAP_SET_GUEST_DEBUG
73
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
74
#endif
75
    int irqchip_in_kernel;
76
    int pit_in_kernel;
77
    int xsave, xcrs;
78
    int many_ioeventfds;
79
};
80

    
81
static KVMState *kvm_state;
82

    
83
static const KVMCapabilityInfo kvm_required_capabilites[] = {
84
    KVM_CAP_INFO(USER_MEMORY),
85
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
86
    KVM_CAP_LAST_INFO
87
};
88

    
89
/*
 * Hand out the first unused entry of s->slots (memory_size == 0).
 * Indices 8..11 are never returned.  If no entry is free the process is
 * aborted after printing a message.
 */
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int idx = 0;

    while (idx < ARRAY_SIZE(s->slots)) {
        /* KVM private memory slots */
        int is_private = (idx >= 8 && idx < 12);

        if (!is_private && s->slots[idx].memory_size == 0) {
            return &s->slots[idx];
        }
        idx++;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}
106

    
107
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
108
                                         target_phys_addr_t start_addr,
109
                                         target_phys_addr_t end_addr)
110
{
111
    int i;
112

    
113
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
114
        KVMSlot *mem = &s->slots[i];
115

    
116
        if (start_addr == mem->start_addr &&
117
            end_addr == mem->start_addr + mem->memory_size) {
118
            return mem;
119
        }
120
    }
121

    
122
    return NULL;
123
}
124

    
125
/*
126
 * Find overlapping slot with lowest start address
127
 */
128
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
129
                                            target_phys_addr_t start_addr,
130
                                            target_phys_addr_t end_addr)
131
{
132
    KVMSlot *found = NULL;
133
    int i;
134

    
135
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
136
        KVMSlot *mem = &s->slots[i];
137

    
138
        if (mem->memory_size == 0 ||
139
            (found && found->start_addr < mem->start_addr)) {
140
            continue;
141
        }
142

    
143
        if (end_addr > mem->start_addr &&
144
            start_addr < mem->start_addr + mem->memory_size) {
145
            found = mem;
146
        }
147
    }
148

    
149
    return found;
150
}
151

    
152
int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
153
                                      target_phys_addr_t *phys_addr)
154
{
155
    int i;
156

    
157
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
158
        KVMSlot *mem = &s->slots[i];
159

    
160
        if (ram_addr >= mem->phys_offset &&
161
            ram_addr < mem->phys_offset + mem->memory_size) {
162
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
163
            return 1;
164
        }
165
    }
166

    
167
    return 0;
168
}
169

    
170
/*
 * Push the current description of @slot to the kernel with
 * KVM_SET_USER_MEMORY_REGION.  While s->migration_log is set the request
 * always carries KVM_MEM_LOG_DIRTY_PAGES, whatever slot->flags says.
 * Returns the result of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;
    int region_flags = slot->flags;

    if (s->migration_log) {
        region_flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }

    region.slot = slot->slot;
    region.flags = region_flags;
    region.guest_phys_addr = slot->start_addr;
    region.memory_size = slot->memory_size;
    region.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}
184

    
185
static void kvm_reset_vcpu(void *opaque)
186
{
187
    CPUState *env = opaque;
188

    
189
    kvm_arch_reset_vcpu(env);
190
}
191

    
192
int kvm_irqchip_in_kernel(void)
193
{
194
    return kvm_state->irqchip_in_kernel;
195
}
196

    
197
int kvm_pit_in_kernel(void)
198
{
199
    return kvm_state->pit_in_kernel;
200
}
201

    
202

    
203
/*
 * Create the KVM vcpu for @env and map its shared kvm_run area.
 *
 * Steps: KVM_CREATE_VCPU, query KVM_GET_VCPU_MMAP_SIZE, mmap the vcpu fd,
 * locate the coalesced MMIO ring inside that mapping (first vcpu only), then
 * run the architecture-specific init.  On success the vcpu reset handler is
 * registered and the vcpu is reset once.
 *
 * Returns 0 on success or a negative value on failure.
 */
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        return ret;
    }
    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        return ret;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        return ret;
    }

    /* The ring sits coalesced_mmio pages into the kvm_run mapping. */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        uint8_t *base = (uint8_t *)env->kvm_run;

        s->coalesced_mmio_ring = (void *)(base + s->coalesced_mmio * PAGE_SIZE);
    }

    ret = kvm_arch_init_vcpu(env);
    if (ret != 0) {
        return ret;
    }

    qemu_register_reset(kvm_reset_vcpu, env);
    kvm_arch_reset_vcpu(env);
    return ret;
}
248

    
249
/*
250
 * dirty pages logging control
251
 */
252
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
253
                                      ram_addr_t size, int flags, int mask)
254
{
255
    KVMState *s = kvm_state;
256
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
257
    int old_flags;
258

    
259
    if (mem == NULL)  {
260
            fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
261
                    TARGET_FMT_plx "\n", __func__, phys_addr,
262
                    (target_phys_addr_t)(phys_addr + size - 1));
263
            return -EINVAL;
264
    }
265

    
266
    old_flags = mem->flags;
267

    
268
    flags = (mem->flags & ~mask) | flags;
269
    mem->flags = flags;
270

    
271
    /* If nothing changed effectively, no need to issue ioctl */
272
    if (s->migration_log) {
273
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
274
    }
275
    if (flags == old_flags) {
276
            return 0;
277
    }
278

    
279
    return kvm_set_user_memory_region(s, mem);
280
}
281

    
282
/* Turn on dirty page logging for the slot covering exactly this range. */
int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int log_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, log_bit, log_bit);
}
287

    
288
/* Turn off dirty page logging for the slot covering exactly this range. */
int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int log_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, 0, log_bit);
}
293

    
294
static int kvm_set_migration_log(int enable)
295
{
296
    KVMState *s = kvm_state;
297
    KVMSlot *mem;
298
    int i, err;
299

    
300
    s->migration_log = enable;
301

    
302
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
303
        mem = &s->slots[i];
304

    
305
        if (!mem->memory_size) {
306
            continue;
307
        }
308
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
309
            continue;
310
        }
311
        err = kvm_set_user_memory_region(s, mem);
312
        if (err) {
313
            return err;
314
        }
315
    }
316
    return 0;
317
}
318

    
319
/* get kvm's dirty pages bitmap and update qemu's */
320
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
321
                                         unsigned long *bitmap,
322
                                         unsigned long offset,
323
                                         unsigned long mem_size)
324
{
325
    unsigned int i, j;
326
    unsigned long page_number, addr, addr1, c;
327
    ram_addr_t ram_addr;
328
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
329
        HOST_LONG_BITS;
330

    
331
    /*
332
     * bitmap-traveling is faster than memory-traveling (for addr...)
333
     * especially when most of the memory is not dirty.
334
     */
335
    for (i = 0; i < len; i++) {
336
        if (bitmap[i] != 0) {
337
            c = leul_to_cpu(bitmap[i]);
338
            do {
339
                j = ffsl(c) - 1;
340
                c &= ~(1ul << j);
341
                page_number = i * HOST_LONG_BITS + j;
342
                addr1 = page_number * TARGET_PAGE_SIZE;
343
                addr = offset + addr1;
344
                ram_addr = cpu_get_physical_page_desc(addr);
345
                cpu_physical_memory_set_dirty(ram_addr);
346
            } while (c != 0);
347
        }
348
    }
349
    return 0;
350
}
351

    
352
#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
353

    
354
/**
355
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
356
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty().
357
 * This means all bits are set to dirty.
358
 *
359
 * @start_add: start of logged region.
360
 * @end_addr: end of logged region.
361
 */
362
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
363
                                          target_phys_addr_t end_addr)
364
{
365
    KVMState *s = kvm_state;
366
    unsigned long size, allocated_size = 0;
367
    KVMDirtyLog d;
368
    KVMSlot *mem;
369
    int ret = 0;
370

    
371
    d.dirty_bitmap = NULL;
372
    while (start_addr < end_addr) {
373
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
374
        if (mem == NULL) {
375
            break;
376
        }
377

    
378
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
379
        if (!d.dirty_bitmap) {
380
            d.dirty_bitmap = qemu_malloc(size);
381
        } else if (size > allocated_size) {
382
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
383
        }
384
        allocated_size = size;
385
        memset(d.dirty_bitmap, 0, allocated_size);
386

    
387
        d.slot = mem->slot;
388

    
389
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
390
            DPRINTF("ioctl failed %d\n", errno);
391
            ret = -1;
392
            break;
393
        }
394

    
395
        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
396
                                      mem->start_addr, mem->memory_size);
397
        start_addr = mem->start_addr + mem->memory_size;
398
    }
399
    qemu_free(d.dirty_bitmap);
400

    
401
    return ret;
402
}
403

    
404
/*
 * Register [start, start + size) as a coalesced MMIO zone.
 * Returns -ENOSYS when coalesced MMIO is unavailable, otherwise the result
 * of the KVM_REGISTER_COALESCED_MMIO ioctl.
 */
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;
    return kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
420

    
421
/*
 * Remove [start, start + size) from the coalesced MMIO zones.
 * Returns -ENOSYS when coalesced MMIO is unavailable, otherwise the result
 * of the KVM_UNREGISTER_COALESCED_MMIO ioctl.
 */
int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;
    return kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
437

    
438
/*
 * Query KVM_CHECK_EXTENSION for @extension.
 * A failing ioctl is reported as 0; otherwise the ioctl result is returned
 * unchanged.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int answer = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return answer < 0 ? 0 : answer;
}
449

    
450
/*
 * Probe whether seven ioeventfds can be registered at once.
 * Returns 1 if all seven could be set up, 0 otherwise (and always 0 when
 * built without CONFIG_EVENTFD or CONFIG_IOTHREAD).  Everything created for
 * the probe is torn down again before returning.
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD) && defined(CONFIG_IOTHREAD)
    int probe_fds[7];
    int registered = 0;
    int many;

    while (registered < ARRAY_SIZE(probe_fds)) {
        probe_fds[registered] = eventfd(0, EFD_CLOEXEC);
        if (probe_fds[registered] < 0) {
            break;
        }
        if (kvm_set_ioeventfd_pio_word(probe_fds[registered], 0,
                                       registered, true) < 0) {
            close(probe_fds[registered]);
            break;
        }
        registered++;
    }

    /* Decide whether many devices are supported or not */
    many = (registered == ARRAY_SIZE(probe_fds));

    /* undo the registrations in reverse order */
    while (registered-- > 0) {
        kvm_set_ioeventfd_pio_word(probe_fds[registered], 0, registered, false);
        close(probe_fds[registered]);
    }
    return many;
#else
    return 0;
#endif
}
486

    
487
static const KVMCapabilityInfo *
488
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
489
{
490
    while (list->name) {
491
        if (!kvm_check_extension(s, list->value)) {
492
            return list;
493
        }
494
        list++;
495
    }
496
    return NULL;
497
}
498

    
499
/*
 * Bring the KVM slot table in line with a new guest-physical mapping of
 * [start_addr, start_addr + size) onto @phys_offset.
 *
 * Every existing slot that overlaps the range is unregistered; the parts of
 * it that lie outside the range are re-registered as prefix/suffix slots.
 * Finally, unless the low bits of @phys_offset mark the range as
 * IO_MEM_UNASSIGNED or above, a slot for the new range is registered.
 * Any ioctl failure aborts the process.
 */
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    /* sub-page bits of phys_offset carry the memory type */
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *slot, prev;
    int rc;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    for (;;) {
        slot = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (slot == NULL) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= slot->start_addr &&
            (start_addr + size <= slot->start_addr + slot->memory_size) &&
            (phys_offset - start_addr == slot->phys_offset - slot->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        /* keep a copy: the entry itself is about to be recycled */
        prev = *slot;

        /* unregister the overlapping slot */
        slot->memory_size = 0;
        rc = kvm_set_user_memory_region(s, slot);
        if (rc) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-rc));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
         * unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            prev.start_addr == start_addr && prev.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            slot = kvm_alloc_slot(s);
            slot->start_addr = prev.start_addr;
            slot->memory_size = prev.memory_size;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-rc));
                abort();
            }

            /* the old fragment now covers the head of the new range */
            start_addr += prev.memory_size;
            phys_offset += prev.memory_size;
            size -= prev.memory_size;
            continue;
        }

        /* register prefix slot */
        if (prev.start_addr < start_addr) {
            slot = kvm_alloc_slot(s);
            slot->start_addr = prev.start_addr;
            slot->memory_size = start_addr - prev.start_addr;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-rc));
                abort();
            }
        }

        /* register suffix slot */
        if (prev.start_addr + prev.memory_size > start_addr + size) {
            ram_addr_t skipped;

            slot = kvm_alloc_slot(s);
            slot->start_addr = start_addr + size;
            skipped = slot->start_addr - prev.start_addr;
            slot->memory_size = prev.memory_size - skipped;
            slot->phys_offset = prev.phys_offset + skipped;
            slot->flags = 0;

            rc = kvm_set_user_memory_region(s, slot);
            if (rc) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-rc));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }

    slot = kvm_alloc_slot(s);
    slot->start_addr = start_addr;
    slot->memory_size = size;
    slot->phys_offset = phys_offset;
    slot->flags = 0;

    rc = kvm_set_user_memory_region(s, slot);
    if (rc) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-rc));
        abort();
    }
}
627

    
628
static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
629
                                  target_phys_addr_t start_addr,
630
                                  ram_addr_t size, ram_addr_t phys_offset)
631
{
632
    kvm_set_phys_mem(start_addr, size, phys_offset);
633
}
634

    
635
static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
636
                                        target_phys_addr_t start_addr,
637
                                        target_phys_addr_t end_addr)
638
{
639
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
640
}
641

    
642
/* CPUPhysMemoryClient.migration_log hook: forwards to
 * kvm_set_migration_log() and returns its result. */
static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    (void)client;
    return kvm_set_migration_log(enable);
}
647

    
648
static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
649
    .set_memory = kvm_client_set_memory,
650
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
651
    .migration_log = kvm_client_migration_log,
652
};
653

    
654
int kvm_init(void)
655
{
656
    static const char upgrade_note[] =
657
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
658
        "(see http://sourceforge.net/projects/kvm).\n";
659
    KVMState *s;
660
    const KVMCapabilityInfo *missing_cap;
661
    int ret;
662
    int i;
663

    
664
    s = qemu_mallocz(sizeof(KVMState));
665

    
666
#ifdef KVM_CAP_SET_GUEST_DEBUG
667
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
668
#endif
669
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
670
        s->slots[i].slot = i;
671
    }
672
    s->vmfd = -1;
673
    s->fd = qemu_open("/dev/kvm", O_RDWR);
674
    if (s->fd == -1) {
675
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
676
        ret = -errno;
677
        goto err;
678
    }
679

    
680
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
681
    if (ret < KVM_API_VERSION) {
682
        if (ret > 0) {
683
            ret = -EINVAL;
684
        }
685
        fprintf(stderr, "kvm version too old\n");
686
        goto err;
687
    }
688

    
689
    if (ret > KVM_API_VERSION) {
690
        ret = -EINVAL;
691
        fprintf(stderr, "kvm version not supported\n");
692
        goto err;
693
    }
694

    
695
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
696
    if (s->vmfd < 0) {
697
#ifdef TARGET_S390X
698
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
699
                        "your host kernel command line\n");
700
#endif
701
        goto err;
702
    }
703

    
704
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
705
    if (!missing_cap) {
706
        missing_cap =
707
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
708
    }
709
    if (missing_cap) {
710
        ret = -EINVAL;
711
        fprintf(stderr, "kvm does not support %s\n%s",
712
                missing_cap->name, upgrade_note);
713
        goto err;
714
    }
715

    
716
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
717

    
718
    s->broken_set_mem_region = 1;
719
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
720
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
721
    if (ret > 0) {
722
        s->broken_set_mem_region = 0;
723
    }
724
#endif
725

    
726
    s->vcpu_events = 0;
727
#ifdef KVM_CAP_VCPU_EVENTS
728
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
729
#endif
730

    
731
    s->robust_singlestep = 0;
732
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
733
    s->robust_singlestep =
734
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
735
#endif
736

    
737
    s->debugregs = 0;
738
#ifdef KVM_CAP_DEBUGREGS
739
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
740
#endif
741

    
742
    s->xsave = 0;
743
#ifdef KVM_CAP_XSAVE
744
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
745
#endif
746

    
747
    s->xcrs = 0;
748
#ifdef KVM_CAP_XCRS
749
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
750
#endif
751

    
752
    ret = kvm_arch_init(s);
753
    if (ret < 0) {
754
        goto err;
755
    }
756

    
757
    kvm_state = s;
758
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
759

    
760
    s->many_ioeventfds = kvm_check_many_ioeventfds();
761

    
762
    return 0;
763

    
764
err:
765
    if (s) {
766
        if (s->vmfd != -1) {
767
            close(s->vmfd);
768
        }
769
        if (s->fd != -1) {
770
            close(s->fd);
771
        }
772
    }
773
    qemu_free(s);
774

    
775
    return ret;
776
}
777

    
778
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
779
                         uint32_t count)
780
{
781
    int i;
782
    uint8_t *ptr = data;
783

    
784
    for (i = 0; i < count; i++) {
785
        if (direction == KVM_EXIT_IO_IN) {
786
            switch (size) {
787
            case 1:
788
                stb_p(ptr, cpu_inb(port));
789
                break;
790
            case 2:
791
                stw_p(ptr, cpu_inw(port));
792
                break;
793
            case 4:
794
                stl_p(ptr, cpu_inl(port));
795
                break;
796
            }
797
        } else {
798
            switch (size) {
799
            case 1:
800
                cpu_outb(port, ldub_p(ptr));
801
                break;
802
            case 2:
803
                cpu_outw(port, lduw_p(ptr));
804
                break;
805
            case 4:
806
                cpu_outl(port, ldl_p(ptr));
807
                break;
808
            }
809
        }
810

    
811
        ptr += size;
812
    }
813

    
814
    return 1;
815
}
816

    
817
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
/*
 * Report a KVM_EXIT_INTERNAL_ERROR exit on stderr.
 *
 * Returns 0 when the suberror is an emulation failure and the architecture
 * code says not to stop on it (the CPU state is dumped first); returns -1
 * in every other case.
 */
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (!kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        fprintf(stderr, "\n");
    } else {
        int idx;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (idx = 0; idx < run->internal.ndata; ++idx) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
        }
    }

    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            return 0;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
#endif
845

    
846
void kvm_flush_coalesced_mmio_buffer(void)
847
{
848
    KVMState *s = kvm_state;
849
    if (s->coalesced_mmio_ring) {
850
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
851
        while (ring->first != ring->last) {
852
            struct kvm_coalesced_mmio *ent;
853

    
854
            ent = &ring->coalesced_mmio[ring->first];
855

    
856
            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
857
            smp_wmb();
858
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
859
        }
860
    }
861
}
862

    
863
static void do_kvm_cpu_synchronize_state(void *_env)
864
{
865
    CPUState *env = _env;
866

    
867
    if (!env->kvm_vcpu_dirty) {
868
        kvm_arch_get_registers(env);
869
        env->kvm_vcpu_dirty = 1;
870
    }
871
}
872

    
873
/* Make sure @env holds the current register state, fetching it via
 * run_on_cpu() if the cached copy is not marked dirty. */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (env->kvm_vcpu_dirty) {
        return;
    }

    run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}
879

    
880
/* Write the reset-level register state of @env to KVM and clear the dirty
 * flag. */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    const int level = KVM_PUT_RESET_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
885

    
886
/* Write the full register state of @env to KVM and clear the dirty flag. */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    const int level = KVM_PUT_FULL_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
891

    
892
/*
 * Run the vcpu behind @env until something needs attention outside the
 * KVM_RUN loop.
 *
 * Each iteration pushes dirty registers to the kernel, issues KVM_RUN with
 * the iothread mutex released, flushes coalesced MMIO and dispatches on the
 * exit reason.  A handler result > 0 re-enters the guest, 0 leaves the loop
 * normally and < 0 leaves it after dumping the CPU state and stopping the VM.
 * A KVM_RUN failure other than -EINTR/-EAGAIN aborts the process.
 *
 * Returns the final handler result (0 or negative).
 */
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    for (;;) {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        /* registers modified on the qemu side must reach the kernel first */
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        kvm_flush_coalesced_mmio_buffer();

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }

        /* only a positive handler result re-enters the guest */
        if (ret <= 0) {
            break;
        }
    }

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(0);
        env->exit_request = 1;
    }
    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}
1006

    
1007
/*
 * Issue an ioctl on the descriptor stored in s->fd.
 *
 * Exactly one pointer-sized variadic argument is read after @type and is
 * forwarded to ioctl() unchanged.
 *
 * Returns the ioctl() result, or -errno when ioctl() reports -1.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->fd, type, param);
    return (rc == -1) ? -errno : rc;
}
1023

    
1024
/*
 * Issue an ioctl on the descriptor stored in s->vmfd.
 *
 * Exactly one pointer-sized variadic argument is read after @type and is
 * forwarded to ioctl() unchanged.
 *
 * Returns the ioctl() result, or -errno when ioctl() reports -1.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->vmfd, type, param);
    return (rc == -1) ? -errno : rc;
}
1040

    
1041
/*
 * Issue an ioctl on the descriptor stored in env->kvm_fd.
 *
 * Exactly one pointer-sized variadic argument is read after @type and is
 * forwarded to ioctl() unchanged.
 *
 * Returns the ioctl() result, or -errno when ioctl() reports -1.
 */
int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(env->kvm_fd, type, param);
    return (rc == -1) ? -errno : rc;
}
1057

    
1058
int kvm_has_sync_mmu(void)
1059
{
1060
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1061
}
1062

    
1063
int kvm_has_vcpu_events(void)
1064
{
1065
    return kvm_state->vcpu_events;
1066
}
1067

    
1068
int kvm_has_robust_singlestep(void)
1069
{
1070
    return kvm_state->robust_singlestep;
1071
}
1072

    
1073
int kvm_has_debugregs(void)
1074
{
1075
    return kvm_state->debugregs;
1076
}
1077

    
1078
int kvm_has_xsave(void)
1079
{
1080
    return kvm_state->xsave;
1081
}
1082

    
1083
int kvm_has_xcrs(void)
1084
{
1085
    return kvm_state->xcrs;
1086
}
1087

    
1088
int kvm_has_many_ioeventfds(void)
1089
{
1090
    if (!kvm_enabled()) {
1091
        return 0;
1092
    }
1093
    return kvm_state->many_ioeventfds;
1094
}
1095

    
1096
void kvm_setup_guest_memory(void *start, size_t size)
1097
{
1098
    if (!kvm_has_sync_mmu()) {
1099
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1100

    
1101
        if (ret) {
1102
            perror("qemu_madvise");
1103
            fprintf(stderr,
1104
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1105
            exit(1);
1106
        }
1107
    }
1108
}
1109

    
1110
#ifdef KVM_CAP_SET_GUEST_DEBUG
1111
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
1112
                                                 target_ulong pc)
1113
{
1114
    struct kvm_sw_breakpoint *bp;
1115

    
1116
    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
1117
        if (bp->pc == pc) {
1118
            return bp;
1119
        }
1120
    }
1121
    return NULL;
1122
}
1123

    
1124
/* Return 1 when the software breakpoint list of env->kvm_state is non-empty, else 0. */
int kvm_sw_breakpoints_active(CPUState *env)
{
    if (QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints)) {
        return 0;
    }
    return 1;
}
1128

    
1129
struct kvm_set_guest_debug_data {
1130
    struct kvm_guest_debug dbg;
1131
    CPUState *env;
1132
    int err;
1133
};
1134

    
1135
static void kvm_invoke_set_guest_debug(void *data)
1136
{
1137
    struct kvm_set_guest_debug_data *dbg_data = data;
1138
    CPUState *env = dbg_data->env;
1139

    
1140
    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
1141
}
1142

    
1143
/*
 * Recompute the guest debug configuration for @env and push it to KVM.
 *
 * @reinject_trap seeds the control word.  KVM_GUESTDBG_ENABLE and
 * KVM_GUESTDBG_SINGLESTEP are added when env->singlestep_enabled is set, and
 * kvm_arch_update_guest_debug() then fills in its architecture-specific part.
 * The ioctl itself is issued via run_on_cpu().
 *
 * Returns the result of the KVM_SET_GUEST_DEBUG ioctl.
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    /*
     * Zero-initialise the whole request so that no indeterminate stack
     * contents are handed to the kernel in fields that neither this function
     * nor the arch hook writes.
     */
    struct kvm_set_guest_debug_data data = {
        .dbg = { .control = reinject_trap },
        .env = env,
        .err = 0,
    };

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}
1158

    
1159
/*
 * Insert a breakpoint of GDB type @type at @addr and refresh the debug
 * configuration of every CPU.
 *
 * For GDB_BREAKPOINT_SW an existing entry at @addr only has its use count
 * incremented; otherwise a new entry is allocated, installed through
 * kvm_arch_insert_sw_breakpoint() and linked into the breakpoint list of
 * current_env->kvm_state.  Every other type is forwarded to
 * kvm_arch_insert_hw_breakpoint() together with @len.
 *
 * Returns 0 on success, -ENOMEM if the allocation yields NULL, or the first
 * non-zero error reported by the arch hooks or kvm_update_guest_debug().
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(*bp));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            /* Release with the allocator that produced the buffer. */
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
1203

    
1204
/*
 * Remove a breakpoint of GDB type @type at @addr and refresh the debug
 * configuration of every CPU.
 *
 * For GDB_BREAKPOINT_SW the entry at @addr is looked up; while its use count
 * is above one it is only decremented.  The last user removes it through
 * kvm_arch_remove_sw_breakpoint(), unlinks it from the list and frees it.
 * Every other type is forwarded to kvm_arch_remove_hw_breakpoint().
 *
 * Returns 0 on success, -ENOENT when no software breakpoint exists at @addr,
 * or the first non-zero error from the arch hooks or
 * kvm_update_guest_debug().
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    CPUState *cpu;
    int rc;

    if (type != GDB_BREAKPOINT_SW) {
        rc = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (rc) {
            return rc;
        }
    } else {
        struct kvm_sw_breakpoint *sw_bp =
            kvm_find_sw_breakpoint(current_env, addr);

        if (sw_bp == NULL) {
            return -ENOENT;
        }

        if (sw_bp->use_count > 1) {
            sw_bp->use_count--;
            return 0;
        }

        rc = kvm_arch_remove_sw_breakpoint(current_env, sw_bp);
        if (rc) {
            return rc;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints,
                      sw_bp, entry);
        qemu_free(sw_bp);
    }

    cpu = first_cpu;
    while (cpu != NULL) {
        rc = kvm_update_guest_debug(cpu, 0);
        if (rc) {
            return rc;
        }
        cpu = cpu->next_cpu;
    }
    return 0;
}
1244

    
1245
/*
 * Drop every software and hardware breakpoint and refresh the debug
 * configuration of every CPU.
 *
 * Each software breakpoint is removed through
 * kvm_arch_remove_sw_breakpoint(), first via @current_env and, if that
 * fails, via each CPU in turn until one succeeds.  The entry is then
 * unlinked from the list of current_env->kvm_state and freed, so the list is
 * empty afterwards and later lookups cannot return stale entries.
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
        /*
         * Unlink and release the bookkeeping entry; the _SAFE iterator
         * already holds the successor, so removal during the walk is fine.
         */
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}
1267

    
1268
#else /* !KVM_CAP_SET_GUEST_DEBUG */
1269

    
1270
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always -EINVAL. */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    (void)env;
    (void)reinject_trap;

    return -EINVAL;
}
1274

    
1275
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always -EINVAL. */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1280

    
1281
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always -EINVAL. */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1286

    
1287
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: nothing to remove. */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    (void)current_env;
}
1290
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
1291

    
1292
/*
 * Install @sigset as the signal mask of the vcpu behind @env through
 * KVM_SET_SIGNAL_MASK.  A NULL @sigset is passed to the ioctl as NULL.
 *
 * Returns the result of kvm_vcpu_ioctl().
 */
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    /*
     * NOTE(review): len is hard-coded to 8 although sizeof(*sigset) bytes are
     * copied below; presumably 8 is the sigset size the kernel expects --
     * confirm against the KVM API documentation.
     */
    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    /* Release with the allocator that produced the buffer. */
    qemu_free(sigmask);

    return r;
}
1310

    
1311
/*
 * Assign (@assign true) or deassign (@assign false) @fd as an ioeventfd that
 * matches 4-byte accesses of value @val at address @addr (no
 * KVM_IOEVENTFD_FLAG_PIO is set).
 *
 * Returns 0 on success, -ENOSYS when KVM_IOEVENTFD is not available at build
 * time or kvm_enabled() is false, or the negative error code returned by
 * kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    /* Designated initialiser zeroes every field that is not named here. */
    struct kvm_ioeventfd iofd = {
        .datamatch = val,
        .addr = addr,
        .len = 4,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH,
        .fd = fd,
    };
    int ret;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    /* kvm_vm_ioctl() already returns -errno; do not re-read errno here. */
    if (ret < 0) {
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}
1342

    
1343
/*
 * Assign (@assign true) or deassign (@assign false) @fd as an ioeventfd that
 * matches 2-byte accesses of value @val at address @addr, with
 * KVM_IOEVENTFD_FLAG_PIO set.
 *
 * Returns 0 on success, -ENOSYS when KVM_IOEVENTFD is not available at build
 * time or kvm_enabled() is false, or the negative value returned by
 * kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd request = {
        .fd = fd,
        .len = 2,
        .addr = addr,
        .datamatch = val,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
    };
    int rc;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        request.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    rc = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &request);
    return (rc < 0) ? rc : 0;
#else
    return -ENOSYS;
#endif
}
1369

    
1370
/* Forward a per-vcpu SIGBUS notification to kvm_arch_on_sigbus_vcpu() and return its result. */
int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
{
    int handled = kvm_arch_on_sigbus_vcpu(env, code, addr);

    return handled;
}
1374

    
1375
/* Forward a SIGBUS notification to kvm_arch_on_sigbus() and return its result. */
int kvm_on_sigbus(int code, void *addr)
{
    int handled = kvm_arch_on_sigbus(code, addr);

    return handled;
}