Statistics
| Branch: | Revision:

root / kvm-all.c @ 6792a57b

History | View | Annotate | Download (34.5 kB)

1
/*
2
 * QEMU KVM support
3
 *
4
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
6
 *
7
 * Authors:
8
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
10
 *
11
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12
 * See the COPYING file in the top-level directory.
13
 *
14
 */
15

    
16
#include <sys/types.h>
17
#include <sys/ioctl.h>
18
#include <sys/mman.h>
19
#include <stdarg.h>
20

    
21
#include <linux/kvm.h>
22

    
23
#include "qemu-common.h"
24
#include "qemu-barrier.h"
25
#include "sysemu.h"
26
#include "hw/hw.h"
27
#include "gdbstub.h"
28
#include "kvm.h"
29
#include "bswap.h"
30

    
31
/* This check must be after config-host.h is included */
32
#ifdef CONFIG_EVENTFD
33
#include <sys/eventfd.h>
34
#endif
35

    
36
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
37
#define PAGE_SIZE TARGET_PAGE_SIZE
38

    
39
//#define DEBUG_KVM
40

    
41
/*
 * Debug tracing helper.  With DEBUG_KVM defined, DPRINTF() forwards its
 * printf-style arguments to stderr; otherwise it expands to an empty
 * statement and its arguments are not evaluated.
 */
#ifndef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#endif
48

    
49
typedef struct KVMSlot
50
{
51
    target_phys_addr_t start_addr;
52
    ram_addr_t memory_size;
53
    ram_addr_t phys_offset;
54
    int slot;
55
    int flags;
56
} KVMSlot;
57

    
58
/* Short alias for the kernel's struct kvm_dirty_log, the argument block
 * this file passes to the KVM_GET_DIRTY_LOG ioctl. */
typedef struct kvm_dirty_log KVMDirtyLog;
59

    
60
struct KVMState
61
{
62
    KVMSlot slots[32];
63
    int fd;
64
    int vmfd;
65
    int coalesced_mmio;
66
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
67
    int broken_set_mem_region;
68
    int migration_log;
69
    int vcpu_events;
70
    int robust_singlestep;
71
    int debugregs;
72
#ifdef KVM_CAP_SET_GUEST_DEBUG
73
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
74
#endif
75
    int irqchip_in_kernel;
76
    int pit_in_kernel;
77
    int xsave, xcrs;
78
    int many_ioeventfds;
79
};
80

    
81
/* The one KVMState instance; assigned by kvm_init() once setup succeeded. */
static KVMState *kvm_state;
82

    
83
static const KVMCapabilityInfo kvm_required_capabilites[] = {
84
    KVM_CAP_INFO(USER_MEMORY),
85
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
86
    KVM_CAP_LAST_INFO
87
};
88

    
89
/*
 * Hand out the first unused entry of s->slots (memory_size == 0).
 *
 * Indices 8..11 are never returned.  If no usable entry is free the
 * function reports the problem on stderr and aborts, so it never
 * returns NULL.
 */
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int idx;

    for (idx = 0; idx < ARRAY_SIZE(s->slots); idx++) {
        /* KVM private memory slots */
        int is_private = (idx >= 8 && idx < 12);

        if (!is_private && s->slots[idx].memory_size == 0) {
            return &s->slots[idx];
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}
106

    
107
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
108
                                         target_phys_addr_t start_addr,
109
                                         target_phys_addr_t end_addr)
110
{
111
    int i;
112

    
113
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
114
        KVMSlot *mem = &s->slots[i];
115

    
116
        if (start_addr == mem->start_addr &&
117
            end_addr == mem->start_addr + mem->memory_size) {
118
            return mem;
119
        }
120
    }
121

    
122
    return NULL;
123
}
124

    
125
/*
126
 * Find overlapping slot with lowest start address
127
 */
128
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
129
                                            target_phys_addr_t start_addr,
130
                                            target_phys_addr_t end_addr)
131
{
132
    KVMSlot *found = NULL;
133
    int i;
134

    
135
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
136
        KVMSlot *mem = &s->slots[i];
137

    
138
        if (mem->memory_size == 0 ||
139
            (found && found->start_addr < mem->start_addr)) {
140
            continue;
141
        }
142

    
143
        if (end_addr > mem->start_addr &&
144
            start_addr < mem->start_addr + mem->memory_size) {
145
            found = mem;
146
        }
147
    }
148

    
149
    return found;
150
}
151

    
152
int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
153
                                      target_phys_addr_t *phys_addr)
154
{
155
    int i;
156

    
157
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
158
        KVMSlot *mem = &s->slots[i];
159

    
160
        if (ram_addr >= mem->phys_offset &&
161
            ram_addr < mem->phys_offset + mem->memory_size) {
162
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
163
            return 1;
164
        }
165
    }
166

    
167
    return 0;
168
}
169

    
170
/*
 * Push one slot description to the kernel via KVM_SET_USER_MEMORY_REGION.
 *
 * KVM_MEM_LOG_DIRTY_PAGES is added to the flags sent to the kernel while
 * s->migration_log is set; slot->flags itself is not modified.
 * Returns the result of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;

    region.flags = slot->flags;
    if (s->migration_log) {
        region.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    region.slot = slot->slot;
    region.guest_phys_addr = slot->start_addr;
    region.memory_size = slot->memory_size;
    region.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}
184

    
185
static void kvm_reset_vcpu(void *opaque)
186
{
187
    CPUState *env = opaque;
188

    
189
    kvm_arch_reset_vcpu(env);
190
}
191

    
192
int kvm_irqchip_in_kernel(void)
193
{
194
    return kvm_state->irqchip_in_kernel;
195
}
196

    
197
int kvm_pit_in_kernel(void)
198
{
199
    return kvm_state->pit_in_kernel;
200
}
201

    
202
/*
 * Create and initialise the KVM vcpu backing env.
 *
 * Steps: create the vcpu (KVM_CREATE_VCPU), map its shared kvm_run area,
 * locate the coalesced MMIO ring inside that mapping if the capability is
 * present and the ring has not been located yet, then run the
 * architecture-specific init.  On success a reset handler is registered
 * and the vcpu is reset once.
 *
 * Returns 0 on success or a negative value on failure.
 */
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long map_len;
    int rc;

    DPRINTF("kvm_init_vcpu\n");

    rc = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (rc < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        return rc;
    }

    env->kvm_fd = rc;
    env->kvm_state = s;

    map_len = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (map_len < 0) {
        rc = map_len;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        return rc;
    }

    env->kvm_run = mmap(NULL, map_len, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        rc = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        return rc;
    }

    /* The capability value doubles as the ring's page offset in kvm_run. */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    rc = kvm_arch_init_vcpu(env);
    if (rc != 0) {
        return rc;
    }

    qemu_register_reset(kvm_reset_vcpu, env);
    kvm_arch_reset_vcpu(env);
    return rc;
}
247

    
248
/*
249
 * dirty pages logging control
250
 */
251
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
252
                                      ram_addr_t size, int flags, int mask)
253
{
254
    KVMState *s = kvm_state;
255
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
256
    int old_flags;
257

    
258
    if (mem == NULL)  {
259
            fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
260
                    TARGET_FMT_plx "\n", __func__, phys_addr,
261
                    (target_phys_addr_t)(phys_addr + size - 1));
262
            return -EINVAL;
263
    }
264

    
265
    old_flags = mem->flags;
266

    
267
    flags = (mem->flags & ~mask) | flags;
268
    mem->flags = flags;
269

    
270
    /* If nothing changed effectively, no need to issue ioctl */
271
    if (s->migration_log) {
272
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
273
    }
274
    if (flags == old_flags) {
275
            return 0;
276
    }
277

    
278
    return kvm_set_user_memory_region(s, mem);
279
}
280

    
281
/*
 * Turn on dirty page logging for the slot covering exactly
 * [phys_addr, phys_addr + size).  Returns the result of
 * kvm_dirty_pages_log_change().
 */
int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int log_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, log_bit, log_bit);
}
286

    
287
/*
 * Turn off dirty page logging for the slot covering exactly
 * [phys_addr, phys_addr + size).  Returns the result of
 * kvm_dirty_pages_log_change().
 */
int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int log_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, 0, log_bit);
}
292

    
293
static int kvm_set_migration_log(int enable)
294
{
295
    KVMState *s = kvm_state;
296
    KVMSlot *mem;
297
    int i, err;
298

    
299
    s->migration_log = enable;
300

    
301
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
302
        mem = &s->slots[i];
303

    
304
        if (!mem->memory_size) {
305
            continue;
306
        }
307
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
308
            continue;
309
        }
310
        err = kvm_set_user_memory_region(s, mem);
311
        if (err) {
312
            return err;
313
        }
314
    }
315
    return 0;
316
}
317

    
318
/* get kvm's dirty pages bitmap and update qemu's */
319
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
320
                                         unsigned long *bitmap,
321
                                         unsigned long offset,
322
                                         unsigned long mem_size)
323
{
324
    unsigned int i, j;
325
    unsigned long page_number, addr, addr1, c;
326
    ram_addr_t ram_addr;
327
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
328
        HOST_LONG_BITS;
329

    
330
    /*
331
     * bitmap-traveling is faster than memory-traveling (for addr...)
332
     * especially when most of the memory is not dirty.
333
     */
334
    for (i = 0; i < len; i++) {
335
        if (bitmap[i] != 0) {
336
            c = leul_to_cpu(bitmap[i]);
337
            do {
338
                j = ffsl(c) - 1;
339
                c &= ~(1ul << j);
340
                page_number = i * HOST_LONG_BITS + j;
341
                addr1 = page_number * TARGET_PAGE_SIZE;
342
                addr = offset + addr1;
343
                ram_addr = cpu_get_physical_page_desc(addr);
344
                cpu_physical_memory_set_dirty(ram_addr);
345
            } while (c != 0);
346
        }
347
    }
348
    return 0;
349
}
350

    
351
/* Round x up to the next multiple of y.  The mask arithmetic only gives
 * that result when y is a power of two; both arguments are evaluated
 * more than once. */
#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
352

    
353
/**
354
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
355
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty().
356
 * This means all bits are set to dirty.
357
 *
358
 * @start_add: start of logged region.
359
 * @end_addr: end of logged region.
360
 */
361
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
362
                                          target_phys_addr_t end_addr)
363
{
364
    KVMState *s = kvm_state;
365
    unsigned long size, allocated_size = 0;
366
    KVMDirtyLog d;
367
    KVMSlot *mem;
368
    int ret = 0;
369

    
370
    d.dirty_bitmap = NULL;
371
    while (start_addr < end_addr) {
372
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
373
        if (mem == NULL) {
374
            break;
375
        }
376

    
377
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
378
        if (!d.dirty_bitmap) {
379
            d.dirty_bitmap = qemu_malloc(size);
380
        } else if (size > allocated_size) {
381
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
382
        }
383
        allocated_size = size;
384
        memset(d.dirty_bitmap, 0, allocated_size);
385

    
386
        d.slot = mem->slot;
387

    
388
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
389
            DPRINTF("ioctl failed %d\n", errno);
390
            ret = -1;
391
            break;
392
        }
393

    
394
        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
395
                                      mem->start_addr, mem->memory_size);
396
        start_addr = mem->start_addr + mem->memory_size;
397
    }
398
    qemu_free(d.dirty_bitmap);
399

    
400
    return ret;
401
}
402

    
403
/*
 * Register [start, start + size) as a coalesced MMIO zone.
 *
 * Returns -ENOSYS when the coalesced MMIO capability was not detected,
 * otherwise the result of the KVM_REGISTER_COALESCED_MMIO ioctl.
 */
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;

    return kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
419

    
420
/*
 * Remove [start, start + size) from the coalesced MMIO zones.
 *
 * Returns -ENOSYS when the coalesced MMIO capability was not detected,
 * otherwise the result of the KVM_UNREGISTER_COALESCED_MMIO ioctl.
 */
int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return -ENOSYS;
    }

    zone.addr = start;
    zone.size = size;

    return kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
436

    
437
/*
 * Query a capability with KVM_CHECK_EXTENSION on the /dev/kvm descriptor.
 *
 * Returns the value reported by the ioctl, with any error mapped to 0,
 * so the result is never negative.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int value = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return value < 0 ? 0 : value;
}
448

    
449
/*
 * Probe whether more than six ioeventfds can be registered.
 *
 * Returns 1 when seven ioeventfds could be created and attached, 0
 * otherwise.  Always returns 0 when built without CONFIG_EVENTFD or
 * CONFIG_IOTHREAD.  Every descriptor created for the probe is detached
 * and closed again before returning.
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD) && defined(CONFIG_IOTHREAD)
    int fds[7];
    int attached = 0;
    int supported;

    while (attached < ARRAY_SIZE(fds)) {
        int fd = eventfd(0, EFD_CLOEXEC);

        if (fd < 0) {
            break;
        }
        fds[attached] = fd;
        if (kvm_set_ioeventfd_pio_word(fd, 0, attached, true) < 0) {
            close(fd);
            break;
        }
        attached++;
    }

    /* Decide whether many devices are supported or not */
    supported = (attached == ARRAY_SIZE(fds));

    /* Tear down in reverse order of creation. */
    while (attached-- > 0) {
        kvm_set_ioeventfd_pio_word(fds[attached], 0, attached, false);
        close(fds[attached]);
    }
    return supported;
#else
    return 0;
#endif
}
485

    
486
static const KVMCapabilityInfo *
487
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
488
{
489
    while (list->name) {
490
        if (!kvm_check_extension(s, list->value)) {
491
            return list;
492
        }
493
        list++;
494
    }
495
    return NULL;
496
}
497

    
498
/*
 * Bring KVM's slot table in line with a new guest-physical mapping of
 * [start_addr, start_addr + size) to phys_offset.
 *
 * Every existing slot overlapping the range is unregistered; the parts of
 * it that lie outside the range are re-registered as prefix and suffix
 * slots.  Finally a slot for the new range is registered, unless the
 * bits of phys_offset outside TARGET_PAGE_MASK are IO_MEM_UNASSIGNED or
 * above, in which case KVM is not told about the range.
 *
 * Any failure to update the kernel is fatal: a message is printed and the
 * process aborts.
 */
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    /* Taken before IO_MEM_ROM is masked out of phys_offset below. */
    ram_addr_t io_flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *slot;
    KVMSlot prev;
    int err;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    for (;;) {
        slot = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (slot == NULL) {
            break;
        }

        if (io_flags < IO_MEM_UNASSIGNED && start_addr >= slot->start_addr &&
            (start_addr + size <= slot->start_addr + slot->memory_size) &&
            (phys_offset - start_addr == slot->phys_offset - slot->start_addr)) {
            /* The requested range lies inside the existing slot and maps
             * to the same backing memory - nothing to be done. */
            return;
        }

        /* Keep a copy: the entry is reused as soon as it is marked free. */
        prev = *slot;

        /* unregister the overlapping slot */
        slot->memory_size = 0;
        err = kvm_set_user_memory_region(s, slot);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
         * unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            prev.start_addr == start_addr && prev.memory_size < size &&
            io_flags < IO_MEM_UNASSIGNED) {
            slot = kvm_alloc_slot(s);
            slot->memory_size = prev.memory_size;
            slot->start_addr = prev.start_addr;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            err = kvm_set_user_memory_region(s, slot);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            /* The old fragment is back in place; shrink the request by it. */
            start_addr += prev.memory_size;
            phys_offset += prev.memory_size;
            size -= prev.memory_size;
            continue;
        }

        /* register prefix slot */
        if (prev.start_addr < start_addr) {
            slot = kvm_alloc_slot(s);
            slot->memory_size = start_addr - prev.start_addr;
            slot->start_addr = prev.start_addr;
            slot->phys_offset = prev.phys_offset;
            slot->flags = 0;

            err = kvm_set_user_memory_region(s, slot);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (prev.start_addr + prev.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            slot = kvm_alloc_slot(s);
            slot->start_addr = start_addr + size;
            size_delta = slot->start_addr - prev.start_addr;
            slot->memory_size = prev.memory_size - size_delta;
            slot->phys_offset = prev.phys_offset + size_delta;
            slot->flags = 0;

            err = kvm_set_user_memory_region(s, slot);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (io_flags >= IO_MEM_UNASSIGNED) {
        return;
    }

    slot = kvm_alloc_slot(s);
    slot->memory_size = size;
    slot->start_addr = start_addr;
    slot->phys_offset = phys_offset;
    slot->flags = 0;

    err = kvm_set_user_memory_region(s, slot);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}
626

    
627
static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
628
                                  target_phys_addr_t start_addr,
629
                                  ram_addr_t size, ram_addr_t phys_offset)
630
{
631
    kvm_set_phys_mem(start_addr, size, phys_offset);
632
}
633

    
634
static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
635
                                        target_phys_addr_t start_addr,
636
                                        target_phys_addr_t end_addr)
637
{
638
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
639
}
640

    
641
/*
 * CPUPhysMemoryClient migration_log callback: forwards to
 * kvm_set_migration_log() and returns its result.  The client argument
 * is not used.
 */
static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    (void)client;
    return kvm_set_migration_log(enable);
}
646

    
647
static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
648
    .set_memory = kvm_client_set_memory,
649
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
650
    .migration_log = kvm_client_migration_log,
651
};
652

    
653
int kvm_init(void)
654
{
655
    static const char upgrade_note[] =
656
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
657
        "(see http://sourceforge.net/projects/kvm).\n";
658
    KVMState *s;
659
    const KVMCapabilityInfo *missing_cap;
660
    int ret;
661
    int i;
662

    
663
    s = qemu_mallocz(sizeof(KVMState));
664

    
665
#ifdef KVM_CAP_SET_GUEST_DEBUG
666
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
667
#endif
668
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
669
        s->slots[i].slot = i;
670
    }
671
    s->vmfd = -1;
672
    s->fd = qemu_open("/dev/kvm", O_RDWR);
673
    if (s->fd == -1) {
674
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
675
        ret = -errno;
676
        goto err;
677
    }
678

    
679
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
680
    if (ret < KVM_API_VERSION) {
681
        if (ret > 0) {
682
            ret = -EINVAL;
683
        }
684
        fprintf(stderr, "kvm version too old\n");
685
        goto err;
686
    }
687

    
688
    if (ret > KVM_API_VERSION) {
689
        ret = -EINVAL;
690
        fprintf(stderr, "kvm version not supported\n");
691
        goto err;
692
    }
693

    
694
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
695
    if (s->vmfd < 0) {
696
#ifdef TARGET_S390X
697
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
698
                        "your host kernel command line\n");
699
#endif
700
        goto err;
701
    }
702

    
703
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
704
    if (!missing_cap) {
705
        missing_cap =
706
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
707
    }
708
    if (missing_cap) {
709
        ret = -EINVAL;
710
        fprintf(stderr, "kvm does not support %s\n%s",
711
                missing_cap->name, upgrade_note);
712
        goto err;
713
    }
714

    
715
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
716

    
717
    s->broken_set_mem_region = 1;
718
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
719
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
720
    if (ret > 0) {
721
        s->broken_set_mem_region = 0;
722
    }
723
#endif
724

    
725
    s->vcpu_events = 0;
726
#ifdef KVM_CAP_VCPU_EVENTS
727
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
728
#endif
729

    
730
    s->robust_singlestep = 0;
731
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
732
    s->robust_singlestep =
733
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
734
#endif
735

    
736
    s->debugregs = 0;
737
#ifdef KVM_CAP_DEBUGREGS
738
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
739
#endif
740

    
741
    s->xsave = 0;
742
#ifdef KVM_CAP_XSAVE
743
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
744
#endif
745

    
746
    s->xcrs = 0;
747
#ifdef KVM_CAP_XCRS
748
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
749
#endif
750

    
751
    ret = kvm_arch_init(s);
752
    if (ret < 0) {
753
        goto err;
754
    }
755

    
756
    kvm_state = s;
757
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
758

    
759
    s->many_ioeventfds = kvm_check_many_ioeventfds();
760

    
761
    return 0;
762

    
763
err:
764
    if (s) {
765
        if (s->vmfd != -1) {
766
            close(s->vmfd);
767
        }
768
        if (s->fd != -1) {
769
            close(s->fd);
770
        }
771
    }
772
    qemu_free(s);
773

    
774
    return ret;
775
}
776

    
777
static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
778
                          uint32_t count)
779
{
780
    int i;
781
    uint8_t *ptr = data;
782

    
783
    for (i = 0; i < count; i++) {
784
        if (direction == KVM_EXIT_IO_IN) {
785
            switch (size) {
786
            case 1:
787
                stb_p(ptr, cpu_inb(port));
788
                break;
789
            case 2:
790
                stw_p(ptr, cpu_inw(port));
791
                break;
792
            case 4:
793
                stl_p(ptr, cpu_inl(port));
794
                break;
795
            }
796
        } else {
797
            switch (size) {
798
            case 1:
799
                cpu_outb(port, ldub_p(ptr));
800
                break;
801
            case 2:
802
                cpu_outw(port, lduw_p(ptr));
803
                break;
804
            case 4:
805
                cpu_outl(port, ldl_p(ptr));
806
                break;
807
            }
808
        }
809

    
810
        ptr += size;
811
    }
812
}
813

    
814
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
/*
 * Report a KVM_EXIT_INTERNAL_ERROR exit on stderr.
 *
 * The suberror and its extra data words are printed when the kernel
 * reports KVM_CAP_INTERNAL_ERROR_DATA.  Returns 0 only for an emulation
 * failure on which kvm_arch_stop_on_emulation_error() says not to stop
 * (the CPU state is dumped in that case); returns -1 otherwise.
 */
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (!kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        fprintf(stderr, "\n");
    } else {
        int idx;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (idx = 0; idx < run->internal.ndata; ++idx) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
        }
    }

    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    if (run->internal.suberror != KVM_INTERNAL_ERROR_EMULATION) {
        return -1;
    }

    fprintf(stderr, "emulation failure\n");
    if (kvm_arch_stop_on_emulation_error(env)) {
        return -1;
    }

    cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
    return 0;
}
#endif
842

    
843
void kvm_flush_coalesced_mmio_buffer(void)
844
{
845
    KVMState *s = kvm_state;
846
    if (s->coalesced_mmio_ring) {
847
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
848
        while (ring->first != ring->last) {
849
            struct kvm_coalesced_mmio *ent;
850

    
851
            ent = &ring->coalesced_mmio[ring->first];
852

    
853
            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
854
            smp_wmb();
855
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
856
        }
857
    }
858
}
859

    
860
static void do_kvm_cpu_synchronize_state(void *_env)
861
{
862
    CPUState *env = _env;
863

    
864
    if (!env->kvm_vcpu_dirty) {
865
        kvm_arch_get_registers(env);
866
        env->kvm_vcpu_dirty = 1;
867
    }
868
}
869

    
870
/*
 * Make sure env holds the current register state: unless the vcpu is
 * already marked dirty, schedule do_kvm_cpu_synchronize_state() through
 * run_on_cpu().
 */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (env->kvm_vcpu_dirty) {
        return;
    }

    run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}
876

    
877
/*
 * Write env's registers to KVM at the KVM_PUT_RESET_STATE level and
 * clear the vcpu's dirty mark.
 */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    const int level = KVM_PUT_RESET_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
882

    
883
/*
 * Write env's registers to KVM at the KVM_PUT_FULL_STATE level and clear
 * the vcpu's dirty mark.
 */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    const int level = KVM_PUT_FULL_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
888

    
889
/*
 * Run the vcpu until an exit needs attention outside this loop.
 *
 * Each iteration writes back dirty registers, enters the guest with
 * KVM_RUN (the iothread mutex is dropped around the ioctl), flushes the
 * coalesced MMIO ring and dispatches on the exit reason.  Handlers set
 * ret > 0 to re-enter the guest, 0 to leave the loop and < 0 to leave it
 * with an error.
 *
 * Returns EXCP_HLT when kvm_arch_process_irqchip_events() reports a
 * non-zero result, EXCP_DEBUG when kvm_arch_debug() claims a debug exit,
 * and EXCP_INTERRUPT otherwise.  On an error exit the CPU state is dumped
 * and the VM is stopped with VMSTOP_PANIC first.  A KVM_RUN failure other
 * than EINTR/EAGAIN aborts the process.
 */
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_irqchip_events(env)) {
        env->exit_request = 0;
        return EXCP_HLT;
    }

    cpu_single_env = env;

    do {
        /* Push register changes made on the QEMU side before entering. */
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();

        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        kvm_flush_coalesced_mmio_buffer();

        if (ret == -EINTR || ret == -EAGAIN) {
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        /* Default: leave the loop unless a handler asks to re-enter. */
        ret = 0;
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 1;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                ret = EXCP_DEBUG;
                goto out;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }
    ret = EXCP_INTERRUPT;

out:
    env->exit_request = 0;
    cpu_single_env = NULL;
    return ret;
}
1006

    
1007
/*
 * Issue an ioctl on the file descriptor stored in s->fd.
 *
 * Exactly one variadic argument is read, as a pointer, and forwarded to
 * ioctl().  Returns the ioctl() result, except that a result of -1 is
 * replaced by -errno.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->fd, type, param);
    return (rc == -1) ? -errno : rc;
}
1023

    
1024
/*
 * Issue an ioctl on the file descriptor stored in s->vmfd.
 *
 * Exactly one variadic argument is read, as a pointer, and forwarded to
 * ioctl().  Returns the ioctl() result, except that a result of -1 is
 * replaced by -errno.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->vmfd, type, param);
    return (rc == -1) ? -errno : rc;
}
1040

    
1041
/*
 * Issue an ioctl on the file descriptor stored in env->kvm_fd.
 *
 * Exactly one variadic argument is read, as a pointer, and forwarded to
 * ioctl().  Returns the ioctl() result, except that a result of -1 is
 * replaced by -errno.
 */
int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(env->kvm_fd, type, param);
    return (rc == -1) ? -errno : rc;
}
1057

    
1058
int kvm_has_sync_mmu(void)
1059
{
1060
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1061
}
1062

    
1063
int kvm_has_vcpu_events(void)
1064
{
1065
    return kvm_state->vcpu_events;
1066
}
1067

    
1068
int kvm_has_robust_singlestep(void)
1069
{
1070
    return kvm_state->robust_singlestep;
1071
}
1072

    
1073
int kvm_has_debugregs(void)
1074
{
1075
    return kvm_state->debugregs;
1076
}
1077

    
1078
int kvm_has_xsave(void)
1079
{
1080
    return kvm_state->xsave;
1081
}
1082

    
1083
int kvm_has_xcrs(void)
1084
{
1085
    return kvm_state->xcrs;
1086
}
1087

    
1088
int kvm_has_many_ioeventfds(void)
1089
{
1090
    if (!kvm_enabled()) {
1091
        return 0;
1092
    }
1093
    return kvm_state->many_ioeventfds;
1094
}
1095

    
1096
void kvm_setup_guest_memory(void *start, size_t size)
1097
{
1098
    if (!kvm_has_sync_mmu()) {
1099
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1100

    
1101
        if (ret) {
1102
            perror("qemu_madvise");
1103
            fprintf(stderr,
1104
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1105
            exit(1);
1106
        }
1107
    }
1108
}
1109

    
1110
#ifdef KVM_CAP_SET_GUEST_DEBUG
1111
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
1112
                                                 target_ulong pc)
1113
{
1114
    struct kvm_sw_breakpoint *bp;
1115

    
1116
    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
1117
        if (bp->pc == pc) {
1118
            return bp;
1119
        }
1120
    }
1121
    return NULL;
1122
}
1123

    
1124
/*
 * Returns 1 when the kvm_sw_breakpoints list of env->kvm_state holds at
 * least one entry, 0 when it is empty.
 */
int kvm_sw_breakpoints_active(CPUState *env)
{
    if (QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints)) {
        return 0;
    }
    return 1;
}
1128

    
1129
struct kvm_set_guest_debug_data {
1130
    struct kvm_guest_debug dbg;
1131
    CPUState *env;
1132
    int err;
1133
};
1134

    
1135
static void kvm_invoke_set_guest_debug(void *data)
1136
{
1137
    struct kvm_set_guest_debug_data *dbg_data = data;
1138
    CPUState *env = dbg_data->env;
1139

    
1140
    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
1141
}
1142

    
1143
/*
 * Build a guest-debug request for @env and apply it via run_on_cpu().
 *
 * The control word starts as @reinject_trap; KVM_GUESTDBG_ENABLE and
 * KVM_GUESTDBG_SINGLESTEP are OR-ed in when env->singlestep_enabled is set.
 * kvm_arch_update_guest_debug() then gets to adjust the request.
 *
 * Returns the result of the KVM_SET_GUEST_DEBUG ioctl as recorded by
 * kvm_invoke_set_guest_debug().
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data req;

    req.env = env;
    req.dbg.control = reinject_trap;
    if (env->singlestep_enabled) {
        req.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &req.dbg);

    run_on_cpu(env, kvm_invoke_set_guest_debug, &req);
    return req.err;
}
1158

    
1159
/*
 * Insert a breakpoint of the given @type at @addr.
 *
 * GDB_BREAKPOINT_SW: if a record for @addr already exists its use count is
 * incremented and 0 is returned immediately.  Otherwise a new record is
 * allocated, passed to kvm_arch_insert_sw_breakpoint() and, on success,
 * linked at the head of the kvm_sw_breakpoints list.
 *
 * Any other type is forwarded to kvm_arch_insert_hw_breakpoint().
 *
 * Afterwards kvm_update_guest_debug(env, 0) is called for every CPU reachable
 * from first_cpu.
 *
 * Returns 0 on success, -ENOMEM if the allocation yields NULL, or the first
 * non-zero error reported by one of the calls above.
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            /* Release with the allocator's own counterpart, not free(). */
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
1203

    
1204
/*
 * Remove a breakpoint of the given @type at @addr.
 *
 * GDB_BREAKPOINT_SW: returns -ENOENT if no record exists for @addr.  If the
 * record's use count is above 1 it is decremented and 0 is returned
 * immediately.  Otherwise the record is passed to
 * kvm_arch_remove_sw_breakpoint() and, on success, unlinked and freed.
 *
 * Any other type is forwarded to kvm_arch_remove_hw_breakpoint().
 *
 * Afterwards kvm_update_guest_debug(env, 0) is called for every CPU reachable
 * from first_cpu.  Returns 0 on success or the first non-zero error.
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *record;
    CPUState *cpu;
    int rc;

    if (type != GDB_BREAKPOINT_SW) {
        rc = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (rc) {
            return rc;
        }
    } else {
        record = kvm_find_sw_breakpoint(current_env, addr);
        if (record == NULL) {
            return -ENOENT;
        }

        /* Still referenced by another user: only drop one reference. */
        if (record->use_count > 1) {
            record->use_count--;
            return 0;
        }

        rc = kvm_arch_remove_sw_breakpoint(current_env, record);
        if (rc) {
            return rc;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints,
                      record, entry);
        qemu_free(record);
    }

    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
        rc = kvm_update_guest_debug(cpu, 0);
        if (rc) {
            return rc;
        }
    }
    return 0;
}
1244

    
1245
/*
 * Remove every software and hardware breakpoint.
 *
 * Each software breakpoint record is passed to
 * kvm_arch_remove_sw_breakpoint() for @current_env; if that fails, every CPU
 * reachable from first_cpu is tried until one succeeds.  The record is then
 * unlinked from the kvm_sw_breakpoints list and freed, so the list is empty
 * when this function returns and later lookups cannot hit stale entries.
 *
 * Hardware breakpoints are dropped via kvm_arch_remove_all_hw_breakpoints(),
 * and finally kvm_update_guest_debug(env, 0) is called for every CPU (its
 * result is ignored here).
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
        /*
         * Drop the bookkeeping record as well; the _SAFE iterator already
         * holds the successor, so unlinking the current element is fine.
         */
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}
1267

    
1268
#else /* !KVM_CAP_SET_GUEST_DEBUG */
1269

    
1270
/* Stub for builds without KVM_CAP_SET_GUEST_DEBUG: always fails with -EINVAL. */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    (void)env;
    (void)reinject_trap;

    return -EINVAL;
}
1274

    
1275
/* Stub for builds without KVM_CAP_SET_GUEST_DEBUG: always fails with -EINVAL. */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1280

    
1281
/* Stub for builds without KVM_CAP_SET_GUEST_DEBUG: always fails with -EINVAL. */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1286

    
1287
/* Stub for builds without KVM_CAP_SET_GUEST_DEBUG: nothing to remove. */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    (void)current_env;
}
1290
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
1291

    
1292
/*
 * Install a signal mask on the vCPU via KVM_SET_SIGNAL_MASK.
 *
 * A NULL @sigset is forwarded to the ioctl as a NULL argument.  Otherwise a
 * temporary struct kvm_signal_mask is allocated, filled with a copy of
 * *@sigset, passed to the ioctl and released again.
 *
 * Returns the result of kvm_vcpu_ioctl().
 */
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    /*
     * NOTE(review): len is hard-coded to 8 while sizeof(*sigset) bytes are
     * copied -- presumably the sigset size the kernel expects; confirm
     * against the KVM API documentation.
     */
    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    /* Release with the allocator's own counterpart, not free(). */
    qemu_free(sigmask);

    return r;
}
1310

    
1311
/*
 * Assign or de-assign an ioeventfd that matches 4-byte MMIO writes of @val
 * to @addr.
 *
 * @fd:     eventfd descriptor placed in the request
 * @addr:   address to match
 * @val:    data value to match (KVM_IOEVENTFD_FLAG_DATAMATCH is always set)
 * @assign: true to register; false adds KVM_IOEVENTFD_FLAG_DEASSIGN
 *
 * Returns 0 on success, -ENOSYS when KVM_IOEVENTFD is not available at build
 * time or kvm_enabled() is false, or the negative value returned by
 * kvm_vm_ioctl() on failure.
 */
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    /*
     * Designated initializer: members not named here are zeroed, so no
     * uninitialised stack bytes are handed to the kernel.
     */
    struct kvm_ioeventfd iofd = {
        .datamatch = val,
        .addr = addr,
        .len = 4,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH,
        .fd = fd,
    };
    int ret;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        /* kvm_vm_ioctl() already returns the negated errno. */
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}
1342

    
1343
/*
 * Assign or de-assign an ioeventfd that matches 2-byte port I/O writes of
 * @val to @addr.
 *
 * The request always carries KVM_IOEVENTFD_FLAG_DATAMATCH and
 * KVM_IOEVENTFD_FLAG_PIO; KVM_IOEVENTFD_FLAG_DEASSIGN is added when @assign
 * is false.
 *
 * Returns 0 on success, -ENOSYS when KVM_IOEVENTFD is not available at build
 * time or kvm_enabled() is false, or the negative value returned by
 * kvm_vm_ioctl() on failure.
 */
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd request = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int rc;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        request.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    rc = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &request);
    return (rc < 0) ? rc : 0;
#else
    return -ENOSYS;
#endif
}
1369

    
1370
/*
 * Forward a SIGBUS notification for @env to kvm_arch_on_sigbus_vcpu() and
 * return its result unchanged.
 */
int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
{
    int handled = kvm_arch_on_sigbus_vcpu(env, code, addr);

    return handled;
}
1374

    
1375
/*
 * Forward a SIGBUS notification to kvm_arch_on_sigbus() and return its
 * result unchanged.
 */
int kvm_on_sigbus(int code, void *addr)
{
    int handled = kvm_arch_on_sigbus(code, addr);

    return handled;
}