Statistics
| Branch: | Revision:

root / kvm-all.c @ 983dfc3b

History | View | Annotate | Download (32.8 kB)

1
/*
2
 * QEMU KVM support
3
 *
4
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
6
 *
7
 * Authors:
8
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
10
 *
11
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12
 * See the COPYING file in the top-level directory.
13
 *
14
 */
15

    
16
#include <sys/types.h>
17
#include <sys/ioctl.h>
18
#include <sys/mman.h>
19
#include <stdarg.h>
20

    
21
#include <linux/kvm.h>
22

    
23
#include "qemu-common.h"
24
#include "qemu-barrier.h"
25
#include "sysemu.h"
26
#include "hw/hw.h"
27
#include "gdbstub.h"
28
#include "kvm.h"
29
#include "bswap.h"
30

    
31
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
32
#define PAGE_SIZE TARGET_PAGE_SIZE
33

    
34
//#define DEBUG_KVM
35

    
36
#ifdef DEBUG_KVM
37
#define DPRINTF(fmt, ...) \
38
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
39
#else
40
#define DPRINTF(fmt, ...) \
41
    do { } while (0)
42
#endif
43

    
44
typedef struct KVMSlot
45
{
46
    target_phys_addr_t start_addr;
47
    ram_addr_t memory_size;
48
    ram_addr_t phys_offset;
49
    int slot;
50
    int flags;
51
} KVMSlot;
52

    
53
typedef struct kvm_dirty_log KVMDirtyLog;
54

    
55
struct KVMState
56
{
57
    KVMSlot slots[32];
58
    int fd;
59
    int vmfd;
60
    int coalesced_mmio;
61
#ifdef KVM_CAP_COALESCED_MMIO
62
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
63
#endif
64
    int broken_set_mem_region;
65
    int migration_log;
66
    int vcpu_events;
67
    int robust_singlestep;
68
    int debugregs;
69
#ifdef KVM_CAP_SET_GUEST_DEBUG
70
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
71
#endif
72
    int irqchip_in_kernel;
73
    int pit_in_kernel;
74
    int xsave, xcrs;
75
};
76

    
77
static KVMState *kvm_state;
78

    
79
/*
 * Hand out the first unused entry of s->slots (memory_size == 0).
 * Indices 8..11 are never returned because they are KVM private memory
 * slots.  Aborts the process when no usable entry is left.
 */
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int idx;

    for (idx = 0; idx < ARRAY_SIZE(s->slots); idx++) {
        KVMSlot *candidate = &s->slots[idx];

        /* KVM private memory slots */
        if (idx >= 8 && idx < 12) {
            continue;
        }
        if (candidate->memory_size == 0) {
            return candidate;
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}
94

    
95
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
96
                                         target_phys_addr_t start_addr,
97
                                         target_phys_addr_t end_addr)
98
{
99
    int i;
100

    
101
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
102
        KVMSlot *mem = &s->slots[i];
103

    
104
        if (start_addr == mem->start_addr &&
105
            end_addr == mem->start_addr + mem->memory_size) {
106
            return mem;
107
        }
108
    }
109

    
110
    return NULL;
111
}
112

    
113
/*
114
 * Find overlapping slot with lowest start address
115
 */
116
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
117
                                            target_phys_addr_t start_addr,
118
                                            target_phys_addr_t end_addr)
119
{
120
    KVMSlot *found = NULL;
121
    int i;
122

    
123
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
124
        KVMSlot *mem = &s->slots[i];
125

    
126
        if (mem->memory_size == 0 ||
127
            (found && found->start_addr < mem->start_addr)) {
128
            continue;
129
        }
130

    
131
        if (end_addr > mem->start_addr &&
132
            start_addr < mem->start_addr + mem->memory_size) {
133
            found = mem;
134
        }
135
    }
136

    
137
    return found;
138
}
139

    
140
int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
141
                                      target_phys_addr_t *phys_addr)
142
{
143
    int i;
144

    
145
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
146
        KVMSlot *mem = &s->slots[i];
147

    
148
        if (ram_addr >= mem->phys_offset &&
149
            ram_addr < mem->phys_offset + mem->memory_size) {
150
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
151
            return 1;
152
        }
153
    }
154

    
155
    return 0;
156
}
157

    
158
/*
 * Push the description of @slot to the kernel with
 * KVM_SET_USER_MEMORY_REGION.  While s->migration_log is set, dirty page
 * logging is requested in addition to the slot's own flags.
 * Returns the result of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;

    region.slot = slot->slot;
    region.guest_phys_addr = slot->start_addr;
    region.memory_size = slot->memory_size;
    region.userspace_addr = (unsigned long)qemu_get_ram_ptr(slot->phys_offset);
    region.flags = slot->flags;
    if (s->migration_log) {
        region.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}
172

    
173
static void kvm_reset_vcpu(void *opaque)
174
{
175
    CPUState *env = opaque;
176

    
177
    kvm_arch_reset_vcpu(env);
178
}
179

    
180
int kvm_irqchip_in_kernel(void)
181
{
182
    return kvm_state->irqchip_in_kernel;
183
}
184

    
185
int kvm_pit_in_kernel(void)
186
{
187
    return kvm_state->pit_in_kernel;
188
}
189

    
190

    
191
/*
 * Create the KVM vcpu that backs @env and map its shared kvm_run area.
 *
 * Steps: KVM_CREATE_VCPU on the VM descriptor, query the mapping size with
 * KVM_GET_VCPU_MMAP_SIZE, mmap() the vcpu descriptor, locate the coalesced
 * MMIO ring (once) and finally run the architecture specific setup.  When
 * that setup returns 0 the vcpu reset handler is registered and the vcpu is
 * reset immediately.
 *
 * Returns the result of kvm_arch_init_vcpu(), or a negative errno-style
 * value when one of the earlier steps fails.
 */
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        /* ret still holds the (non-negative) vcpu descriptor at this point;
         * without overwriting it the failure would be reported as success. */
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

#ifdef KVM_CAP_COALESCED_MMIO
    /* The ring sits s->coalesced_mmio pages into the kvm_run mapping; only
     * the first vcpu that gets here records its location. */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring = (void *) env->kvm_run +
                s->coalesced_mmio * PAGE_SIZE;
    }
#endif

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}
236

    
237
/*
238
 * dirty pages logging control
239
 */
240
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
241
                                      ram_addr_t size, int flags, int mask)
242
{
243
    KVMState *s = kvm_state;
244
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
245
    int old_flags;
246

    
247
    if (mem == NULL)  {
248
            fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
249
                    TARGET_FMT_plx "\n", __func__, phys_addr,
250
                    (target_phys_addr_t)(phys_addr + size - 1));
251
            return -EINVAL;
252
    }
253

    
254
    old_flags = mem->flags;
255

    
256
    flags = (mem->flags & ~mask) | flags;
257
    mem->flags = flags;
258

    
259
    /* If nothing changed effectively, no need to issue ioctl */
260
    if (s->migration_log) {
261
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
262
    }
263
    if (flags == old_flags) {
264
            return 0;
265
    }
266

    
267
    return kvm_set_user_memory_region(s, mem);
268
}
269

    
270
/* Turn on KVM_MEM_LOG_DIRTY_PAGES for the slot spanning exactly
 * [phys_addr, phys_addr + size). */
int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int log_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, log_bit, log_bit);
}
276

    
277
/* Clear KVM_MEM_LOG_DIRTY_PAGES for the slot spanning exactly
 * [phys_addr, phys_addr + size). */
int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int log_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, 0, log_bit);
}
283

    
284
static int kvm_set_migration_log(int enable)
285
{
286
    KVMState *s = kvm_state;
287
    KVMSlot *mem;
288
    int i, err;
289

    
290
    s->migration_log = enable;
291

    
292
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
293
        mem = &s->slots[i];
294

    
295
        if (!mem->memory_size) {
296
            continue;
297
        }
298
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
299
            continue;
300
        }
301
        err = kvm_set_user_memory_region(s, mem);
302
        if (err) {
303
            return err;
304
        }
305
    }
306
    return 0;
307
}
308

    
309
/* get kvm's dirty pages bitmap and update qemu's */
310
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
311
                                         unsigned long *bitmap,
312
                                         unsigned long offset,
313
                                         unsigned long mem_size)
314
{
315
    unsigned int i, j;
316
    unsigned long page_number, addr, addr1, c;
317
    ram_addr_t ram_addr;
318
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
319
        HOST_LONG_BITS;
320

    
321
    /*
322
     * bitmap-traveling is faster than memory-traveling (for addr...)
323
     * especially when most of the memory is not dirty.
324
     */
325
    for (i = 0; i < len; i++) {
326
        if (bitmap[i] != 0) {
327
            c = leul_to_cpu(bitmap[i]);
328
            do {
329
                j = ffsl(c) - 1;
330
                c &= ~(1ul << j);
331
                page_number = i * HOST_LONG_BITS + j;
332
                addr1 = page_number * TARGET_PAGE_SIZE;
333
                addr = offset + addr1;
334
                ram_addr = cpu_get_physical_page_desc(addr);
335
                cpu_physical_memory_set_dirty(ram_addr);
336
            } while (c != 0);
337
        }
338
    }
339
    return 0;
340
}
341

    
342
#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
343

    
344
/**
345
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
346
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty().
347
 * This means all bits are set to dirty.
348
 *
349
 * @start_add: start of logged region.
350
 * @end_addr: end of logged region.
351
 */
352
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
353
                                          target_phys_addr_t end_addr)
354
{
355
    KVMState *s = kvm_state;
356
    unsigned long size, allocated_size = 0;
357
    KVMDirtyLog d;
358
    KVMSlot *mem;
359
    int ret = 0;
360

    
361
    d.dirty_bitmap = NULL;
362
    while (start_addr < end_addr) {
363
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
364
        if (mem == NULL) {
365
            break;
366
        }
367

    
368
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
369
        if (!d.dirty_bitmap) {
370
            d.dirty_bitmap = qemu_malloc(size);
371
        } else if (size > allocated_size) {
372
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
373
        }
374
        allocated_size = size;
375
        memset(d.dirty_bitmap, 0, allocated_size);
376

    
377
        d.slot = mem->slot;
378

    
379
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
380
            DPRINTF("ioctl failed %d\n", errno);
381
            ret = -1;
382
            break;
383
        }
384

    
385
        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
386
                                      mem->start_addr, mem->memory_size);
387
        start_addr = mem->start_addr + mem->memory_size;
388
    }
389
    qemu_free(d.dirty_bitmap);
390

    
391
    return ret;
392
}
393

    
394
/*
 * Ask the kernel to coalesce MMIO writes to [start, start + size).
 *
 * Returns -ENOSYS when coalesced MMIO is not compiled in or not reported by
 * the kernel, otherwise the result of the KVM_REGISTER_COALESCED_MMIO ioctl.
 */
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        /* Clear the whole structure first so that any member or padding
         * not assigned below reaches the kernel as zero rather than as
         * stack garbage. */
        memset(&zone, 0, sizeof(zone));
        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}
412

    
413
/*
 * Stop coalescing MMIO writes to [start, start + size).
 *
 * Returns -ENOSYS when coalesced MMIO is not compiled in or not reported by
 * the kernel, otherwise the result of the KVM_UNREGISTER_COALESCED_MMIO
 * ioctl.
 */
int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        /* Clear the whole structure first so that any member or padding
         * not assigned below reaches the kernel as zero rather than as
         * stack garbage. */
        memset(&zone, 0, sizeof(zone));
        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}
431

    
432
/*
 * Probe a capability with KVM_CHECK_EXTENSION on the /dev/kvm descriptor.
 * A failing ioctl is reported as 0, i.e. "not available"; otherwise the
 * ioctl's value is returned unchanged.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int probed = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return probed < 0 ? 0 : probed;
}
443

    
444
static void kvm_set_phys_mem(target_phys_addr_t start_addr,
445
                             ram_addr_t size,
446
                             ram_addr_t phys_offset)
447
{
448
    KVMState *s = kvm_state;
449
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
450
    KVMSlot *mem, old;
451
    int err;
452

    
453
    /* kvm works in page size chunks, but the function may be called
454
       with sub-page size and unaligned start address. */
455
    size = TARGET_PAGE_ALIGN(size);
456
    start_addr = TARGET_PAGE_ALIGN(start_addr);
457

    
458
    /* KVM does not support read-only slots */
459
    phys_offset &= ~IO_MEM_ROM;
460

    
461
    while (1) {
462
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
463
        if (!mem) {
464
            break;
465
        }
466

    
467
        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
468
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
469
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
470
            /* The new slot fits into the existing one and comes with
471
             * identical parameters - nothing to be done. */
472
            return;
473
        }
474

    
475
        old = *mem;
476

    
477
        /* unregister the overlapping slot */
478
        mem->memory_size = 0;
479
        err = kvm_set_user_memory_region(s, mem);
480
        if (err) {
481
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
482
                    __func__, strerror(-err));
483
            abort();
484
        }
485

    
486
        /* Workaround for older KVM versions: we can't join slots, even not by
487
         * unregistering the previous ones and then registering the larger
488
         * slot. We have to maintain the existing fragmentation. Sigh.
489
         *
490
         * This workaround assumes that the new slot starts at the same
491
         * address as the first existing one. If not or if some overlapping
492
         * slot comes around later, we will fail (not seen in practice so far)
493
         * - and actually require a recent KVM version. */
494
        if (s->broken_set_mem_region &&
495
            old.start_addr == start_addr && old.memory_size < size &&
496
            flags < IO_MEM_UNASSIGNED) {
497
            mem = kvm_alloc_slot(s);
498
            mem->memory_size = old.memory_size;
499
            mem->start_addr = old.start_addr;
500
            mem->phys_offset = old.phys_offset;
501
            mem->flags = 0;
502

    
503
            err = kvm_set_user_memory_region(s, mem);
504
            if (err) {
505
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
506
                        strerror(-err));
507
                abort();
508
            }
509

    
510
            start_addr += old.memory_size;
511
            phys_offset += old.memory_size;
512
            size -= old.memory_size;
513
            continue;
514
        }
515

    
516
        /* register prefix slot */
517
        if (old.start_addr < start_addr) {
518
            mem = kvm_alloc_slot(s);
519
            mem->memory_size = start_addr - old.start_addr;
520
            mem->start_addr = old.start_addr;
521
            mem->phys_offset = old.phys_offset;
522
            mem->flags = 0;
523

    
524
            err = kvm_set_user_memory_region(s, mem);
525
            if (err) {
526
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
527
                        __func__, strerror(-err));
528
                abort();
529
            }
530
        }
531

    
532
        /* register suffix slot */
533
        if (old.start_addr + old.memory_size > start_addr + size) {
534
            ram_addr_t size_delta;
535

    
536
            mem = kvm_alloc_slot(s);
537
            mem->start_addr = start_addr + size;
538
            size_delta = mem->start_addr - old.start_addr;
539
            mem->memory_size = old.memory_size - size_delta;
540
            mem->phys_offset = old.phys_offset + size_delta;
541
            mem->flags = 0;
542

    
543
            err = kvm_set_user_memory_region(s, mem);
544
            if (err) {
545
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
546
                        __func__, strerror(-err));
547
                abort();
548
            }
549
        }
550
    }
551

    
552
    /* in case the KVM bug workaround already "consumed" the new slot */
553
    if (!size)
554
        return;
555

    
556
    /* KVM does not need to know about this memory */
557
    if (flags >= IO_MEM_UNASSIGNED)
558
        return;
559

    
560
    mem = kvm_alloc_slot(s);
561
    mem->memory_size = size;
562
    mem->start_addr = start_addr;
563
    mem->phys_offset = phys_offset;
564
    mem->flags = 0;
565

    
566
    err = kvm_set_user_memory_region(s, mem);
567
    if (err) {
568
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
569
                strerror(-err));
570
        abort();
571
    }
572
}
573

    
574
static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
575
                                  target_phys_addr_t start_addr,
576
                                  ram_addr_t size,
577
                                  ram_addr_t phys_offset)
578
{
579
        kvm_set_phys_mem(start_addr, size, phys_offset);
580
}
581

    
582
static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
583
                                        target_phys_addr_t start_addr,
584
                                        target_phys_addr_t end_addr)
585
{
586
        return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
587
}
588

    
589
/* CPUPhysMemoryClient.migration_log hook: forwards to
 * kvm_set_migration_log(); @client is not used. */
static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    int rc = kvm_set_migration_log(enable);

    return rc;
}
594

    
595
static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
596
        .set_memory = kvm_client_set_memory,
597
        .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
598
        .migration_log = kvm_client_migration_log,
599
};
600

    
601
int kvm_init(int smp_cpus)
602
{
603
    static const char upgrade_note[] =
604
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
605
        "(see http://sourceforge.net/projects/kvm).\n";
606
    KVMState *s;
607
    int ret;
608
    int i;
609

    
610
    s = qemu_mallocz(sizeof(KVMState));
611

    
612
#ifdef KVM_CAP_SET_GUEST_DEBUG
613
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
614
#endif
615
    for (i = 0; i < ARRAY_SIZE(s->slots); i++)
616
        s->slots[i].slot = i;
617

    
618
    s->vmfd = -1;
619
    s->fd = qemu_open("/dev/kvm", O_RDWR);
620
    if (s->fd == -1) {
621
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
622
        ret = -errno;
623
        goto err;
624
    }
625

    
626
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
627
    if (ret < KVM_API_VERSION) {
628
        if (ret > 0)
629
            ret = -EINVAL;
630
        fprintf(stderr, "kvm version too old\n");
631
        goto err;
632
    }
633

    
634
    if (ret > KVM_API_VERSION) {
635
        ret = -EINVAL;
636
        fprintf(stderr, "kvm version not supported\n");
637
        goto err;
638
    }
639

    
640
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
641
    if (s->vmfd < 0) {
642
#ifdef TARGET_S390X
643
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
644
                        "your host kernel command line\n");
645
#endif
646
        goto err;
647
    }
648

    
649
    /* initially, KVM allocated its own memory and we had to jump through
650
     * hooks to make phys_ram_base point to this.  Modern versions of KVM
651
     * just use a user allocated buffer so we can use regular pages
652
     * unmodified.  Make sure we have a sufficiently modern version of KVM.
653
     */
654
    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
655
        ret = -EINVAL;
656
        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
657
                upgrade_note);
658
        goto err;
659
    }
660

    
661
    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
662
     * destroyed properly.  Since we rely on this capability, refuse to work
663
     * with any kernel without this capability. */
664
    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
665
        ret = -EINVAL;
666

    
667
        fprintf(stderr,
668
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
669
                upgrade_note);
670
        goto err;
671
    }
672

    
673
    s->coalesced_mmio = 0;
674
#ifdef KVM_CAP_COALESCED_MMIO
675
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
676
    s->coalesced_mmio_ring = NULL;
677
#endif
678

    
679
    s->broken_set_mem_region = 1;
680
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
681
    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
682
    if (ret > 0) {
683
        s->broken_set_mem_region = 0;
684
    }
685
#endif
686

    
687
    s->vcpu_events = 0;
688
#ifdef KVM_CAP_VCPU_EVENTS
689
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
690
#endif
691

    
692
    s->robust_singlestep = 0;
693
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
694
    s->robust_singlestep =
695
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
696
#endif
697

    
698
    s->debugregs = 0;
699
#ifdef KVM_CAP_DEBUGREGS
700
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
701
#endif
702

    
703
    s->xsave = 0;
704
#ifdef KVM_CAP_XSAVE
705
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
706
#endif
707

    
708
    s->xcrs = 0;
709
#ifdef KVM_CAP_XCRS
710
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
711
#endif
712

    
713
    ret = kvm_arch_init(s, smp_cpus);
714
    if (ret < 0)
715
        goto err;
716

    
717
    kvm_state = s;
718
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
719

    
720
    return 0;
721

    
722
err:
723
    if (s) {
724
        if (s->vmfd != -1)
725
            close(s->vmfd);
726
        if (s->fd != -1)
727
            close(s->fd);
728
    }
729
    qemu_free(s);
730

    
731
    return ret;
732
}
733

    
734
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
735
                         uint32_t count)
736
{
737
    int i;
738
    uint8_t *ptr = data;
739

    
740
    for (i = 0; i < count; i++) {
741
        if (direction == KVM_EXIT_IO_IN) {
742
            switch (size) {
743
            case 1:
744
                stb_p(ptr, cpu_inb(port));
745
                break;
746
            case 2:
747
                stw_p(ptr, cpu_inw(port));
748
                break;
749
            case 4:
750
                stl_p(ptr, cpu_inl(port));
751
                break;
752
            }
753
        } else {
754
            switch (size) {
755
            case 1:
756
                cpu_outb(port, ldub_p(ptr));
757
                break;
758
            case 2:
759
                cpu_outw(port, lduw_p(ptr));
760
                break;
761
            case 4:
762
                cpu_outl(port, ldl_p(ptr));
763
                break;
764
            }
765
        }
766

    
767
        ptr += size;
768
    }
769

    
770
    return 1;
771
}
772

    
773
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
774
/*
 * Handle a KVM_EXIT_INTERNAL_ERROR exit: print the suberror and its extra
 * data (when the kernel reports KVM_CAP_INTERNAL_ERROR_DATA), dump the CPU
 * state and stop the VM.  An emulation failure does not stop the VM when
 * kvm_arch_stop_on_emulation_error() returns zero.
 */
static void kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int idx;

        fprintf(stderr, "KVM internal error. Suberror: %d\n",
                run->internal.suberror);

        for (idx = 0; idx < run->internal.ndata; ++idx) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
        }
    }

    cpu_dump_state(env, stderr, fprintf, 0);

    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            return;
        }
    }

    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    vm_stop(0);
}
799
#endif
800

    
801
/*
 * Replay every write queued in the coalesced MMIO ring through
 * cpu_physical_memory_write() and advance the ring's read index.
 * Does nothing when the ring has not been located yet or coalesced MMIO
 * support is not compiled in.
 */
void kvm_flush_coalesced_mmio_buffer(void)
{
#ifdef KVM_CAP_COALESCED_MMIO
    struct kvm_coalesced_mmio_ring *ring = kvm_state->coalesced_mmio_ring;

    if (!ring) {
        return;
    }

    while (ring->first != ring->last) {
        struct kvm_coalesced_mmio *ent = &ring->coalesced_mmio[ring->first];

        cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
        /* write barrier between consuming the entry and publishing the
         * new read index */
        smp_wmb();
        ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
    }
#endif
}
819

    
820
static void do_kvm_cpu_synchronize_state(void *_env)
821
{
822
    CPUState *env = _env;
823

    
824
    if (!env->kvm_vcpu_dirty) {
825
        kvm_arch_get_registers(env);
826
        env->kvm_vcpu_dirty = 1;
827
    }
828
}
829

    
830
/* Make sure qemu's copy of the vcpu registers is current; the fetch itself
 * is dispatched through run_on_cpu(). */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (env->kvm_vcpu_dirty) {
        return;
    }

    run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}
835

    
836
/* Write the registers to KVM at KVM_PUT_RESET_STATE level and mark qemu's
 * copy clean. */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);

    env->kvm_vcpu_dirty = 0;
}
841

    
842
/* Write the registers to KVM at KVM_PUT_FULL_STATE level and mark qemu's
 * copy clean. */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);

    env->kvm_vcpu_dirty = 0;
}
847

    
848
/*
 * Run the vcpu until an exit occurs that cannot be handled by simply
 * re-entering the guest.
 *
 * Each iteration flushes dirty registers to KVM, issues KVM_RUN with the
 * iothread mutex released, then dispatches on the exit reason.  A handler
 * result greater than zero re-enters the guest; anything else ends the
 * loop.  A failing KVM_RUN other than -EINTR/-EAGAIN aborts the process.
 *
 * Returns the last handler result (zero or negative).  A pending
 * exit_request is consumed and turned into EXCP_INTERRUPT, except on the
 * early return taken for a debug exit that kvm_arch_debug() claims.
 */
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    do {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        /* push register changes made on the qemu side into the kernel */
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        /* the iothread mutex is dropped for the duration of KVM_RUN */
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        /* leave the loop unless a handler below asks for re-entry */
        ret = 0;
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            DPRINTF("kvm_exit_unknown\n");
            break;
        case KVM_EXIT_FAIL_ENTRY:
            DPRINTF("kvm_exit_fail_entry\n");
            break;
        case KVM_EXIT_EXCEPTION:
            DPRINTF("kvm_exit_exception\n");
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}
961

    
962
/*
 * ioctl() on the /dev/kvm descriptor.  Exactly one variadic argument is
 * read (as a pointer) and passed through.  Returns the ioctl's value, or
 * -errno when it fails.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->fd, type, param);

    return rc == -1 ? -errno : rc;
}
978

    
979
/*
 * ioctl() on the VM descriptor.  Exactly one variadic argument is read (as
 * a pointer) and passed through.  Returns the ioctl's value, or -errno when
 * it fails.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(s->vmfd, type, param);

    return rc == -1 ? -errno : rc;
}
995

    
996
/*
 * ioctl() on the vcpu descriptor of @env.  Exactly one variadic argument is
 * read (as a pointer) and passed through.  Returns the ioctl's value, or
 * -errno when it fails.
 */
int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    rc = ioctl(env->kvm_fd, type, param);

    return rc == -1 ? -errno : rc;
}
1012

    
1013
/* Probe KVM_CAP_SYNC_MMU on every call; 0 when the capability is unknown at
 * build time. */
int kvm_has_sync_mmu(void)
{
#ifdef KVM_CAP_SYNC_MMU
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
#else
    return 0;
#endif
}
1023

    
1024
int kvm_has_vcpu_events(void)
1025
{
1026
    return kvm_state->vcpu_events;
1027
}
1028

    
1029
int kvm_has_robust_singlestep(void)
1030
{
1031
    return kvm_state->robust_singlestep;
1032
}
1033

    
1034
int kvm_has_debugregs(void)
1035
{
1036
    return kvm_state->debugregs;
1037
}
1038

    
1039
int kvm_has_xsave(void)
1040
{
1041
    return kvm_state->xsave;
1042
}
1043

    
1044
int kvm_has_xcrs(void)
1045
{
1046
    return kvm_state->xcrs;
1047
}
1048

    
1049
void kvm_setup_guest_memory(void *start, size_t size)
1050
{
1051
    if (!kvm_has_sync_mmu()) {
1052
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1053

    
1054
        if (ret) {
1055
            perror("qemu_madvise");
1056
            fprintf(stderr,
1057
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1058
            exit(1);
1059
        }
1060
    }
1061
}
1062

    
1063
#ifdef KVM_CAP_SET_GUEST_DEBUG
1064
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
1065
                                                 target_ulong pc)
1066
{
1067
    struct kvm_sw_breakpoint *bp;
1068

    
1069
    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
1070
        if (bp->pc == pc)
1071
            return bp;
1072
    }
1073
    return NULL;
1074
}
1075

    
1076
/*
 * Return 1 if at least one software breakpoint is on the list of env's KVM
 * state, 0 if the list is empty.
 */
int kvm_sw_breakpoints_active(CPUState *env)
{
    if (QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints)) {
        return 0;
    }
    return 1;
}
1080

    
1081
struct kvm_set_guest_debug_data {
1082
    struct kvm_guest_debug dbg;
1083
    CPUState *env;
1084
    int err;
1085
};
1086

    
1087
static void kvm_invoke_set_guest_debug(void *data)
1088
{
1089
    struct kvm_set_guest_debug_data *dbg_data = data;
1090
    CPUState *env = dbg_data->env;
1091

    
1092
    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
1093
}
1094

    
1095
/*
 * Push the current debug configuration of 'env' to KVM.
 *
 * The control word starts out as 'reinject_trap'; single-stepping flags are
 * OR-ed in when env->singlestep_enabled is set, and the architecture hook
 * then fills in the rest of the request.  The ioctl itself is dispatched
 * through run_on_cpu().
 *
 * Returns the result of the KVM_SET_GUEST_DEBUG ioctl.
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data request;

    request.env = env;
    request.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        request.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }

    kvm_arch_update_guest_debug(env, &request.dbg);

    run_on_cpu(env, kvm_invoke_set_guest_debug, &request);

    return request.err;
}
1110

    
1111
/*
 * Insert a breakpoint of the given gdbstub 'type' at guest address 'addr'.
 *
 * Software breakpoints (GDB_BREAKPOINT_SW) are reference counted: if one
 * already exists at 'addr', its use count is bumped and 0 is returned
 * without touching the guest.  Otherwise a new entry is allocated, installed
 * through the architecture hook and put at the head of the per-VM list.
 * Every other type is forwarded to the architecture's hardware breakpoint
 * hook.
 *
 * After a new breakpoint has been installed, the debug state of every vCPU
 * is refreshed.
 *
 * Returns 0 on success, or the first error reported by the allocation, the
 * architecture hook, or kvm_update_guest_debug().
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(*bp));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            /* Allocated with qemu_malloc(), so release with qemu_free(). */
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
1152

    
1153
/*
 * Remove a breakpoint of the given gdbstub 'type' at guest address 'addr'.
 *
 * For software breakpoints the use count is dropped first; only when the
 * last user goes away is the breakpoint removed through the architecture
 * hook, unlinked from the per-VM list and freed.  Every other type is
 * forwarded to the architecture's hardware breakpoint hook.
 *
 * After an actual removal, the debug state of every vCPU is refreshed.
 *
 * Returns 0 on success, -ENOENT if no software breakpoint exists at 'addr',
 * or the first error reported by the architecture hook or
 * kvm_update_guest_debug().
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *found;
    CPUState *cpu;
    int rc;

    if (type != GDB_BREAKPOINT_SW) {
        rc = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (rc) {
            return rc;
        }
    } else {
        found = kvm_find_sw_breakpoint(current_env, addr);
        if (!found) {
            return -ENOENT;
        }

        /* Still referenced by another user: just drop one reference. */
        if (found->use_count > 1) {
            found->use_count--;
            return 0;
        }

        rc = kvm_arch_remove_sw_breakpoint(current_env, found);
        if (rc) {
            return rc;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, found,
                      entry);
        qemu_free(found);
    }

    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
        rc = kvm_update_guest_debug(cpu, 0);
        if (rc) {
            return rc;
        }
    }
    return 0;
}
1189

    
1190
/*
 * Remove every software and hardware breakpoint and refresh the debug state
 * of all vCPUs.
 *
 * Each software breakpoint is first removed through 'current_env'; if that
 * fails, every vCPU is tried in turn until one succeeds.  Regardless of the
 * outcome, the entry is then unlinked from the per-VM list and freed so that
 * no stale breakpoint records are left behind (and none are leaked).
 *
 * This is best effort: errors from the architecture hooks and from
 * kvm_update_guest_debug() are not reported to the caller.
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
        /* The _SAFE iterator already holds 'next', so unlinking is fine. */
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}
1210

    
1211
#else /* !KVM_CAP_SET_GUEST_DEBUG */
1212

    
1213
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: guest debugging is
 * unavailable, so the request is always rejected with -EINVAL.
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    (void)env;
    (void)reinject_trap;

    return -EINVAL;
}
1217

    
1218
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: breakpoints cannot
 * be inserted, so the request is always rejected with -EINVAL.
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1223

    
1224
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: there are no
 * breakpoints to remove, so the request is always rejected with -EINVAL.
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1229

    
1230
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: nothing to do.
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    (void)current_env;
}
1233
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
1234

    
1235
/*
 * Install (or clear) the signal mask KVM applies for the given vCPU.
 *
 * With sigset == NULL the ioctl is issued with a NULL argument.  Otherwise a
 * temporary struct kvm_signal_mask is built that carries a copy of *sigset,
 * passed to KVM_SET_SIGNAL_MASK, and released again.
 *
 * Returns the result of kvm_vcpu_ioctl().
 */
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    /*
     * NOTE(review): the length is fixed at 8 bytes rather than
     * sizeof(*sigset); presumably this is the signal-set size the kernel
     * expects -- confirm against the KVM API documentation.
     */
    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);

    /* Allocated with qemu_malloc(), so release with qemu_free(). */
    qemu_free(sigmask);

    return r;
}
1252

    
1253
/*
 * Assign or deassign an ioeventfd that matches a 4-byte write of 'val' to
 * address 'addr' (no PIO flag is set on the request).
 *
 * fd:     eventfd descriptor to bind
 * addr:   address to match
 * val:    data value to match (KVM_IOEVENTFD_FLAG_DATAMATCH is set)
 * assign: true to register the ioeventfd, false to deassign it
 *
 * Returns 0 on success, -ENOSYS if KVM is not enabled or KVM_IOEVENTFD is
 * not available at build time, or the negative errno reported by
 * kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    /* Designated initializer zeroes every field not named here. */
    struct kvm_ioeventfd iofd = {
        .datamatch = val,
        .addr = addr,
        .len = 4,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH,
        .fd = fd,
    };

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    /*
     * kvm_vm_ioctl() already returns a negative errno on failure; pass it
     * through instead of re-reading errno.
     */
    if (ret < 0) {
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}
1284

    
1285
/*
 * Assign or deassign an ioeventfd that matches a 2-byte port I/O write of
 * 'val' to port 'addr'.
 *
 * fd:     eventfd descriptor to bind
 * addr:   I/O port to match
 * val:    data value to match (KVM_IOEVENTFD_FLAG_DATAMATCH is set)
 * assign: true to register the ioeventfd, false to deassign it
 *
 * Returns 0 on success, -ENOSYS if KVM is not enabled or KVM_IOEVENTFD is
 * not available at build time, or the negative value returned by
 * kvm_vm_ioctl().
 */
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int rc;
    struct kvm_ioeventfd request = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        request.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    rc = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &request);

    return (rc < 0) ? rc : 0;
#else
    return -ENOSYS;
#endif
}