/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
#ifdef KVM_CAP_COALESCED_MMIO
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
    int xsave, xcrs;
    int many_ioeventfds;
};

static KVMState *kvm_state;

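/* Return the first unused entry in s->slots, skipping the range reserved as
 * KVM private memory slots (indices 8-11 here).  Aborts if every usable slot
 * is already taken. */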
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        /* KVM private memory slots */
        if (i >= 8 && i < 12) {
            continue;
        }
        if (s->slots[i].memory_size == 0) {
            return &s->slots[i];
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}

int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
                                      target_phys_addr_t *phys_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (ram_addr >= mem->phys_offset &&
            ram_addr < mem->phys_offset + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
            return 1;
        }
    }

    return 0;
}

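/* Push one slot's current configuration to the kernel.  A memory_size of 0
 * unregisters the slot; dirty logging is forced on while migration logging
 * is active. */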
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}

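/* Create a vcpu for env via KVM_CREATE_VCPU and mmap the shared kvm_run
 * structure through which userspace and the kernel exchange exit state. */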
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

#ifdef KVM_CAP_COALESCED_MMIO
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }
#endif

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, 0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!mem->memory_size) {
            continue;
        }
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty(), i.e. all the dirty flags of each page
 * that the kernel reports dirty are set.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}

int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

static int kvm_check_many_ioeventfds(void)
{
    /* Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#ifdef CONFIG_EVENTFD
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

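/* Map a guest physical range onto a KVM slot.  Any slot that overlaps the new
 * range is unregistered first; the non-overlapping head and tail of an old
 * slot are re-registered as separate prefix/suffix slots, and finally the new
 * range itself is registered (unless it is ROM or unassigned, which KVM need
 * not know about). */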
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
         * unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }
    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size, ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
};

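/* Open /dev/kvm, verify the API version, create the VM, check the
 * capabilities this code depends on, and cache the optional extensions in
 * KVMState. */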
int kvm_init(int smp_cpus)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        s->slots[i].slot = i;
    }
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        goto err;
    }

    /* initially, KVM allocated its own memory and we had to jump through
     * hoops to make phys_ram_base point to this.  Modern versions of KVM
     * just use a user allocated buffer so we can use regular pages
     * unmodified.  Make sure we have a sufficiently modern version of KVM.
     */
    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
                upgrade_note);
        goto err;
    }

    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
     * destroyed properly.  Since we rely on this capability, refuse to work
     * with any kernel without this capability. */
    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        ret = -EINVAL;
        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        goto err;
    }

    s->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_mmio_ring = NULL;
#endif

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->xsave = 0;
#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

    s->xcrs = 0;
#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

    ret = kvm_arch_init(s, smp_cpus);
    if (ret < 0) {
        goto err;
    }

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    return 0;

err:
    if (s) {
        if (s->vmfd != -1) {
            close(s->vmfd);
        }
        if (s->fd != -1) {
            close(s->fd);
        }
    }
    qemu_free(s);

    return ret;
}

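/* Replay a batch of programmed I/O requests recorded in the kvm_run area:
 * each element is forwarded to the emulated in/out port handlers. */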
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
                         uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }

    return 1;
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    } else {
        fprintf(stderr, "\n");
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            cpu_dump_state(env, stderr, fprintf, 0);
            return 0;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
#endif

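/* Drain the ring of MMIO writes that the kernel coalesced instead of exiting
 * to userspace: replay each entry through cpu_physical_memory_write(), with
 * the barrier ordering the replay before the slot is handed back via the
 * consumer index. */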
void kvm_flush_coalesced_mmio_buffer(void)
{
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
#endif
}

static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty) {
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
    }
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}

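/* Main vcpu execution loop: flush dirty register state, enter the guest via
 * KVM_RUN (with the iothread lock dropped), then dispatch on
 * run->exit_reason.  A positive handler result re-enters the guest; zero or
 * negative leaves the loop. */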
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    do {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, 0);
        vm_stop(0);
        env->exit_request = 1;
    }
    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}

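/* Thin wrappers around ioctl() for the three KVM file descriptor levels: the
 * global /dev/kvm fd, the per-VM fd, and the per-vcpu fd.  Failures are
 * normalized to a negative errno value. */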
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_has_sync_mmu(void)
{
#ifdef KVM_CAP_SYNC_MMU
    KVMState *s = kvm_state;

    return kvm_check_extension(s, KVM_CAP_SYNC_MMU);
#else
    return 0;
#endif
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);

        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr,
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
            exit(1);
        }
    }
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

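/* Software breakpoints are refcounted via use_count and shared across vcpus;
 * hardware breakpoints are delegated to the architecture code.  Either way,
 * every vcpu's debug state is refreshed afterwards. */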
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                           bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}

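/* Bind an eventfd to a guest doorbell register: a 4-byte MMIO write (below)
 * or a 2-byte PIO write (further down) whose value matches `val` signals
 * `fd` in the kernel, without a round trip through userspace. */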
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    struct kvm_ioeventfd iofd;

    iofd.datamatch = val;
    iofd.addr = addr;
    iofd.len = 4;
    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
    iofd.fd = fd;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
#else
    return -ENOSYS;
#endif
}