/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
    int xsave, xcrs;
    int many_ioeventfds;
};

static KVMState *kvm_state;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_LAST_INFO
};
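
/* Find the first unused slot (memory_size == 0); slots 8 to 11 are
 * skipped as KVM private memory slots.  Aborts when every slot is in
 * use, since callers cannot recover from that. */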
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        /* KVM private memory slots */
        if (i >= 8 && i < 12) {
            continue;
        }
        if (s->slots[i].memory_size == 0) {
            return &s->slots[i];
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}

int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
                                      target_phys_addr_t *phys_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (ram_addr >= mem->phys_offset &&
            ram_addr < mem->phys_offset + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
            return 1;
        }
    }

    return 0;
}
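
/* Translate a KVMSlot into a struct kvm_userspace_memory_region and hand
 * it to the KVM_SET_USER_MEMORY_REGION vm ioctl, forcing dirty logging
 * on while migration logging is active. */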
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}
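
/* Create a vcpu via KVM_CREATE_VCPU, mmap its shared kvm_run area, and
 * hook up architecture state and reset handling.  The coalesced MMIO
 * ring, when available, lives right after kvm_run in the same mapping. */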
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, 0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}
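
/* Toggle global migration logging: remember the new state and reissue
 * KVM_SET_USER_MEMORY_REGION for every populated slot whose dirty
 * logging flag does not already match it. */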
static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!mem->memory_size) {
            continue;
        }
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty(), which marks all dirty flag
 * categories of a logged page as dirty.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}
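
/* Register/unregister a guest physical range for MMIO coalescing, so
 * that the kernel can log writes to it in the coalesced MMIO ring
 * (drained by kvm_flush_coalesced_mmio_buffer() below) instead of
 * taking a userspace exit for every access. */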
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

static int kvm_check_many_ioeventfds(void)
{
    /* Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#ifdef CONFIG_EVENTFD
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}
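
/*
 * Register, resize, or delete the KVM memory slot(s) backing a region.
 *
 * The loop below drops every slot that overlaps the new region and
 * re-registers the non-overlapping prefix and suffix pieces; the
 * broken_set_mem_region path additionally preserves the old slot
 * fragmentation for kernels that cannot join slots.  Regions that KVM
 * does not need to know about (flags >= IO_MEM_UNASSIGNED) are never
 * registered.
 */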
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, not even
         * by unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }
    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size, ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
};
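
/* Open /dev/kvm, create the VM, verify the API version and the required
 * capabilities, probe the optional ones, and register the physical
 * memory client above so RAM regions get mirrored into KVM slots. */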
int kvm_init(void)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        s->slots[i].slot = i;
    }
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
        ret = s->vmfd; /* propagate -errno instead of the API version */
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        goto err;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->xsave = 0;
#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

    s->xcrs = 0;
#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

    ret = kvm_arch_init(s);
    if (ret < 0) {
        goto err;
    }

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    return 0;

err:
    if (s) {
        if (s->vmfd != -1) {
            close(s->vmfd);
        }
        if (s->fd != -1) {
            close(s->fd);
        }
    }
    qemu_free(s);

    return ret;
}
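
/* Perform the (possibly string) port I/O requested by a KVM_EXIT_IO:
 * 'count' accesses of 'size' bytes each, staged in the kvm_run area. */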
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
                         uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }

    return 1;
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    } else {
        fprintf(stderr, "\n");
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            return 0;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
#endif
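
/* Drain the coalesced MMIO ring shared with the kernel, replaying each
 * buffered write through cpu_physical_memory_write().  ring->first is
 * only advanced after the write, with a write barrier in between. */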
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
}

static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty) {
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
    }
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}
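
/* Main vcpu loop: sync registers if needed, enter KVM_RUN, and dispatch
 * the exit reason.  Handlers leave ret > 0 to re-enter the guest, 0 to
 * drop back to the qemu main loop, and < 0 on fatal errors. */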
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    do {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(0);
        env->exit_request = 1;
    }
    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}
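
/* The three ioctl wrappers below target the system, VM, and vcpu file
 * descriptors respectively; each converts the -1/errno convention into
 * a -errno return value.  Typical use, as seen throughout this file:
 * ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); */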
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_has_sync_mmu(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);

        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr,
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
            exit(1);
        }
    }
}
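
/* Guest debugging support: software breakpoints are tracked per-VM in
 * kvm_sw_breakpoints and installed via the kvm_arch_* hooks; without
 * KVM_CAP_SET_GUEST_DEBUG only the -EINVAL stubs further below are
 * built. */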
#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}
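
/* Insert a breakpoint on all CPUs: software breakpoints are reference
 * counted and patched in by kvm_arch_insert_sw_breakpoint(), hardware
 * ones go through kvm_arch_insert_hw_breakpoint(); finally the guest
 * debug state of every vcpu is refreshed. */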
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                           bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}
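
/* Bind (or unbind) an eventfd to a 4-byte MMIO or 2-byte PIO datamatch
 * address so the kernel can signal the fd directly on matching guest
 * writes, without a userspace exit. */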
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    struct kvm_ioeventfd iofd;

    iofd.datamatch = val;
    iofd.addr = addr;
    iofd.len = 4;
    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
    iofd.fd = fd;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return ret; /* kvm_vm_ioctl() already returns -errno */
    }

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
#else
    return -ENOSYS;
#endif
}