Statistics
| Branch: | Revision:

root / kvm-all.c @ 70fedd76

History | View | Annotate | Download (32 kB)

1
/*
2
 * QEMU KVM support
3
 *
4
 * Copyright IBM, Corp. 2008
5
 *           Red Hat, Inc. 2008
6
 *
7
 * Authors:
8
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *  Glauber Costa     <gcosta@redhat.com>
10
 *
11
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12
 * See the COPYING file in the top-level directory.
13
 *
14
 */
15

    
16
#include <sys/types.h>
17
#include <sys/ioctl.h>
18
#include <sys/mman.h>
19
#include <stdarg.h>
20

    
21
#include <linux/kvm.h>
22

    
23
#include "qemu-common.h"
24
#include "qemu-barrier.h"
25
#include "sysemu.h"
26
#include "hw/hw.h"
27
#include "gdbstub.h"
28
#include "kvm.h"
29
#include "bswap.h"
30

    
31
/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
32
#define PAGE_SIZE TARGET_PAGE_SIZE
33

    
34
//#define DEBUG_KVM
35

    
36
/*
 * Debug tracing helper.  With DEBUG_KVM defined, DPRINTF() prints to stderr;
 * otherwise it expands to an empty statement and its arguments are not
 * evaluated.
 */
#if defined(DEBUG_KVM)
#define DPRINTF(fmt, ...) do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) do { } while (0)
#endif
43

    
44
typedef struct KVMSlot
45
{
46
    target_phys_addr_t start_addr;
47
    ram_addr_t memory_size;
48
    ram_addr_t phys_offset;
49
    int slot;
50
    int flags;
51
} KVMSlot;
52

    
53
typedef struct kvm_dirty_log KVMDirtyLog;
54

    
55
struct KVMState
56
{
57
    KVMSlot slots[32];
58
    int fd;
59
    int vmfd;
60
    int coalesced_mmio;
61
#ifdef KVM_CAP_COALESCED_MMIO
62
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
63
#endif
64
    int broken_set_mem_region;
65
    int migration_log;
66
    int vcpu_events;
67
    int robust_singlestep;
68
    int debugregs;
69
#ifdef KVM_CAP_SET_GUEST_DEBUG
70
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
71
#endif
72
    int irqchip_in_kernel;
73
    int pit_in_kernel;
74
    int xsave, xcrs;
75
};
76

    
77
static KVMState *kvm_state;
78

    
79
/*
 * Return the first unused slot (memory_size == 0).  Indices 8..11 are
 * skipped because they are KVM private memory slots.  Aborts when no slot
 * is free.
 */
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int idx = 0;

    while (idx < ARRAY_SIZE(s->slots)) {
        KVMSlot *candidate = &s->slots[idx];

        /* KVM private memory slots are never handed out */
        if ((idx < 8 || idx >= 12) && candidate->memory_size == 0) {
            return candidate;
        }
        idx++;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}
94

    
95
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
96
                                         target_phys_addr_t start_addr,
97
                                         target_phys_addr_t end_addr)
98
{
99
    int i;
100

    
101
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
102
        KVMSlot *mem = &s->slots[i];
103

    
104
        if (start_addr == mem->start_addr &&
105
            end_addr == mem->start_addr + mem->memory_size) {
106
            return mem;
107
        }
108
    }
109

    
110
    return NULL;
111
}
112

    
113
/*
114
 * Find overlapping slot with lowest start address
115
 */
116
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
117
                                            target_phys_addr_t start_addr,
118
                                            target_phys_addr_t end_addr)
119
{
120
    KVMSlot *found = NULL;
121
    int i;
122

    
123
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
124
        KVMSlot *mem = &s->slots[i];
125

    
126
        if (mem->memory_size == 0 ||
127
            (found && found->start_addr < mem->start_addr)) {
128
            continue;
129
        }
130

    
131
        if (end_addr > mem->start_addr &&
132
            start_addr < mem->start_addr + mem->memory_size) {
133
            found = mem;
134
        }
135
    }
136

    
137
    return found;
138
}
139

    
140
/*
 * Push @slot to the kernel with KVM_SET_USER_MEMORY_REGION.
 * KVM_MEM_LOG_DIRTY_PAGES is forced on while s->migration_log is set.
 * Returns the value of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;

    region.slot = slot->slot;
    region.flags = s->migration_log ? (slot->flags | KVM_MEM_LOG_DIRTY_PAGES)
                                    : slot->flags;
    region.guest_phys_addr = slot->start_addr;
    region.memory_size = slot->memory_size;
    region.userspace_addr = (unsigned long)qemu_get_ram_ptr(slot->phys_offset);

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}
154

    
155
static void kvm_reset_vcpu(void *opaque)
156
{
157
    CPUState *env = opaque;
158

    
159
    kvm_arch_reset_vcpu(env);
160
}
161

    
162
int kvm_irqchip_in_kernel(void)
163
{
164
    return kvm_state->irqchip_in_kernel;
165
}
166

    
167
int kvm_pit_in_kernel(void)
168
{
169
    return kvm_state->pit_in_kernel;
170
}
171

    
172

    
173
/*
 * Create the KVM vcpu for @env, map its kvm_run area and run the
 * architecture-specific initialisation.
 *
 * On the normal path the return value is that of kvm_arch_init_vcpu();
 * when it is 0 the vcpu reset handler is registered and the vcpu is reset
 * once.  On any earlier failure a negative errno value is returned.
 */
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        /*
         * ret still holds the non-negative vcpu descriptor at this point;
         * without this assignment the failure would be reported as success.
         */
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

#ifdef KVM_CAP_COALESCED_MMIO
    /*
     * The coalesced MMIO ring lives inside the kvm_run mapping, at a page
     * offset given by s->coalesced_mmio; it is located once, from the first
     * vcpu that gets here.
     */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring = (void *) env->kvm_run +
                s->coalesced_mmio * PAGE_SIZE;
    }
#endif

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}
218

    
219
/*
220
 * dirty pages logging control
221
 */
222
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
223
                                      ram_addr_t size, int flags, int mask)
224
{
225
    KVMState *s = kvm_state;
226
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
227
    int old_flags;
228

    
229
    if (mem == NULL)  {
230
            fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
231
                    TARGET_FMT_plx "\n", __func__, phys_addr,
232
                    (target_phys_addr_t)(phys_addr + size - 1));
233
            return -EINVAL;
234
    }
235

    
236
    old_flags = mem->flags;
237

    
238
    flags = (mem->flags & ~mask) | flags;
239
    mem->flags = flags;
240

    
241
    /* If nothing changed effectively, no need to issue ioctl */
242
    if (s->migration_log) {
243
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
244
    }
245
    if (flags == old_flags) {
246
            return 0;
247
    }
248

    
249
    return kvm_set_user_memory_region(s, mem);
250
}
251

    
252
/* Turn on KVM_MEM_LOG_DIRTY_PAGES for the slot at [phys_addr, phys_addr + size). */
int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int dirty_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, dirty_bit, dirty_bit);
}
258

    
259
/* Turn off KVM_MEM_LOG_DIRTY_PAGES for the slot at [phys_addr, phys_addr + size). */
int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    const int dirty_bit = KVM_MEM_LOG_DIRTY_PAGES;

    return kvm_dirty_pages_log_change(phys_addr, size, 0, dirty_bit);
}
265

    
266
static int kvm_set_migration_log(int enable)
267
{
268
    KVMState *s = kvm_state;
269
    KVMSlot *mem;
270
    int i, err;
271

    
272
    s->migration_log = enable;
273

    
274
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
275
        mem = &s->slots[i];
276

    
277
        if (!mem->memory_size) {
278
            continue;
279
        }
280
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
281
            continue;
282
        }
283
        err = kvm_set_user_memory_region(s, mem);
284
        if (err) {
285
            return err;
286
        }
287
    }
288
    return 0;
289
}
290

    
291
/* get kvm's dirty pages bitmap and update qemu's */
292
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
293
                                         unsigned long *bitmap,
294
                                         unsigned long offset,
295
                                         unsigned long mem_size)
296
{
297
    unsigned int i, j;
298
    unsigned long page_number, addr, addr1, c;
299
    ram_addr_t ram_addr;
300
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
301
        HOST_LONG_BITS;
302

    
303
    /*
304
     * bitmap-traveling is faster than memory-traveling (for addr...)
305
     * especially when most of the memory is not dirty.
306
     */
307
    for (i = 0; i < len; i++) {
308
        if (bitmap[i] != 0) {
309
            c = leul_to_cpu(bitmap[i]);
310
            do {
311
                j = ffsl(c) - 1;
312
                c &= ~(1ul << j);
313
                page_number = i * HOST_LONG_BITS + j;
314
                addr1 = page_number * TARGET_PAGE_SIZE;
315
                addr = offset + addr1;
316
                ram_addr = cpu_get_physical_page_desc(addr);
317
                cpu_physical_memory_set_dirty(ram_addr);
318
            } while (c != 0);
319
        }
320
    }
321
    return 0;
322
}
323

    
324
/* Round x up to a multiple of y (mask arithmetic: y must be a power of two). */
#define ALIGN(x, y)  (((x) + ((y) - 1)) & ~((y) - 1))
325

    
326
/**
327
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
328
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty().
329
 * This means all bits are set to dirty.
330
 *
331
 * @start_add: start of logged region.
332
 * @end_addr: end of logged region.
333
 */
334
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
335
                                          target_phys_addr_t end_addr)
336
{
337
    KVMState *s = kvm_state;
338
    unsigned long size, allocated_size = 0;
339
    KVMDirtyLog d;
340
    KVMSlot *mem;
341
    int ret = 0;
342

    
343
    d.dirty_bitmap = NULL;
344
    while (start_addr < end_addr) {
345
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
346
        if (mem == NULL) {
347
            break;
348
        }
349

    
350
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
351
        if (!d.dirty_bitmap) {
352
            d.dirty_bitmap = qemu_malloc(size);
353
        } else if (size > allocated_size) {
354
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
355
        }
356
        allocated_size = size;
357
        memset(d.dirty_bitmap, 0, allocated_size);
358

    
359
        d.slot = mem->slot;
360

    
361
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
362
            DPRINTF("ioctl failed %d\n", errno);
363
            ret = -1;
364
            break;
365
        }
366

    
367
        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
368
                                      mem->start_addr, mem->memory_size);
369
        start_addr = mem->start_addr + mem->memory_size;
370
    }
371
    qemu_free(d.dirty_bitmap);
372

    
373
    return ret;
374
}
375

    
376
/*
 * Register [start, start + size) as a coalesced MMIO zone.
 *
 * Returns -ENOSYS when coalesced MMIO is not compiled in or not reported by
 * the kernel, otherwise the result of the KVM_REGISTER_COALESCED_MMIO ioctl.
 */
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        /*
         * Use an initializer so that any members not named here are zeroed
         * instead of being handed to the kernel as uninitialized stack data.
         */
        struct kvm_coalesced_mmio_zone zone = {
            .addr = start,
            .size = size,
        };

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}
394

    
395
/*
 * Unregister the coalesced MMIO zone [start, start + size).
 *
 * Returns -ENOSYS when coalesced MMIO is not compiled in or not reported by
 * the kernel, otherwise the result of the KVM_UNREGISTER_COALESCED_MMIO ioctl.
 */
int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        /*
         * Use an initializer so that any members not named here are zeroed
         * instead of being handed to the kernel as uninitialized stack data.
         */
        struct kvm_coalesced_mmio_zone zone = {
            .addr = start,
            .size = size,
        };

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}
413

    
414
/*
 * Query KVM_CHECK_EXTENSION for @extension.
 * Returns the ioctl result, with any negative (error) result mapped to 0.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return ret < 0 ? 0 : ret;
}
425

    
426
static void kvm_set_phys_mem(target_phys_addr_t start_addr,
427
                             ram_addr_t size,
428
                             ram_addr_t phys_offset)
429
{
430
    KVMState *s = kvm_state;
431
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
432
    KVMSlot *mem, old;
433
    int err;
434

    
435
    if (start_addr & ~TARGET_PAGE_MASK) {
436
        if (flags >= IO_MEM_UNASSIGNED) {
437
            if (!kvm_lookup_overlapping_slot(s, start_addr,
438
                                             start_addr + size)) {
439
                return;
440
            }
441
            fprintf(stderr, "Unaligned split of a KVM memory slot\n");
442
        } else {
443
            fprintf(stderr, "Only page-aligned memory slots supported\n");
444
        }
445
        abort();
446
    }
447

    
448
    /* KVM does not support read-only slots */
449
    phys_offset &= ~IO_MEM_ROM;
450

    
451
    while (1) {
452
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
453
        if (!mem) {
454
            break;
455
        }
456

    
457
        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
458
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
459
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
460
            /* The new slot fits into the existing one and comes with
461
             * identical parameters - nothing to be done. */
462
            return;
463
        }
464

    
465
        old = *mem;
466

    
467
        /* unregister the overlapping slot */
468
        mem->memory_size = 0;
469
        err = kvm_set_user_memory_region(s, mem);
470
        if (err) {
471
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
472
                    __func__, strerror(-err));
473
            abort();
474
        }
475

    
476
        /* Workaround for older KVM versions: we can't join slots, even not by
477
         * unregistering the previous ones and then registering the larger
478
         * slot. We have to maintain the existing fragmentation. Sigh.
479
         *
480
         * This workaround assumes that the new slot starts at the same
481
         * address as the first existing one. If not or if some overlapping
482
         * slot comes around later, we will fail (not seen in practice so far)
483
         * - and actually require a recent KVM version. */
484
        if (s->broken_set_mem_region &&
485
            old.start_addr == start_addr && old.memory_size < size &&
486
            flags < IO_MEM_UNASSIGNED) {
487
            mem = kvm_alloc_slot(s);
488
            mem->memory_size = old.memory_size;
489
            mem->start_addr = old.start_addr;
490
            mem->phys_offset = old.phys_offset;
491
            mem->flags = 0;
492

    
493
            err = kvm_set_user_memory_region(s, mem);
494
            if (err) {
495
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
496
                        strerror(-err));
497
                abort();
498
            }
499

    
500
            start_addr += old.memory_size;
501
            phys_offset += old.memory_size;
502
            size -= old.memory_size;
503
            continue;
504
        }
505

    
506
        /* register prefix slot */
507
        if (old.start_addr < start_addr) {
508
            mem = kvm_alloc_slot(s);
509
            mem->memory_size = start_addr - old.start_addr;
510
            mem->start_addr = old.start_addr;
511
            mem->phys_offset = old.phys_offset;
512
            mem->flags = 0;
513

    
514
            err = kvm_set_user_memory_region(s, mem);
515
            if (err) {
516
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
517
                        __func__, strerror(-err));
518
                abort();
519
            }
520
        }
521

    
522
        /* register suffix slot */
523
        if (old.start_addr + old.memory_size > start_addr + size) {
524
            ram_addr_t size_delta;
525

    
526
            mem = kvm_alloc_slot(s);
527
            mem->start_addr = start_addr + size;
528
            size_delta = mem->start_addr - old.start_addr;
529
            mem->memory_size = old.memory_size - size_delta;
530
            mem->phys_offset = old.phys_offset + size_delta;
531
            mem->flags = 0;
532

    
533
            err = kvm_set_user_memory_region(s, mem);
534
            if (err) {
535
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
536
                        __func__, strerror(-err));
537
                abort();
538
            }
539
        }
540
    }
541

    
542
    /* in case the KVM bug workaround already "consumed" the new slot */
543
    if (!size)
544
        return;
545

    
546
    /* KVM does not need to know about this memory */
547
    if (flags >= IO_MEM_UNASSIGNED)
548
        return;
549

    
550
    mem = kvm_alloc_slot(s);
551
    mem->memory_size = size;
552
    mem->start_addr = start_addr;
553
    mem->phys_offset = phys_offset;
554
    mem->flags = 0;
555

    
556
    err = kvm_set_user_memory_region(s, mem);
557
    if (err) {
558
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
559
                strerror(-err));
560
        abort();
561
    }
562
}
563

    
564
static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
565
                                  target_phys_addr_t start_addr,
566
                                  ram_addr_t size,
567
                                  ram_addr_t phys_offset)
568
{
569
        kvm_set_phys_mem(start_addr, size, phys_offset);
570
}
571

    
572
static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
573
                                        target_phys_addr_t start_addr,
574
                                        target_phys_addr_t end_addr)
575
{
576
        return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
577
}
578

    
579
/* CPUPhysMemoryClient.migration_log hook: forwards to
 * kvm_set_migration_log() and returns its result. */
static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    int ret = kvm_set_migration_log(enable);

    return ret;
}
584

    
585
static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
586
        .set_memory = kvm_client_set_memory,
587
        .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
588
        .migration_log = kvm_client_migration_log,
589
};
590

    
591
int kvm_init(int smp_cpus)
592
{
593
    static const char upgrade_note[] =
594
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
595
        "(see http://sourceforge.net/projects/kvm).\n";
596
    KVMState *s;
597
    int ret;
598
    int i;
599

    
600
    s = qemu_mallocz(sizeof(KVMState));
601

    
602
#ifdef KVM_CAP_SET_GUEST_DEBUG
603
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
604
#endif
605
    for (i = 0; i < ARRAY_SIZE(s->slots); i++)
606
        s->slots[i].slot = i;
607

    
608
    s->vmfd = -1;
609
    s->fd = qemu_open("/dev/kvm", O_RDWR);
610
    if (s->fd == -1) {
611
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
612
        ret = -errno;
613
        goto err;
614
    }
615

    
616
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
617
    if (ret < KVM_API_VERSION) {
618
        if (ret > 0)
619
            ret = -EINVAL;
620
        fprintf(stderr, "kvm version too old\n");
621
        goto err;
622
    }
623

    
624
    if (ret > KVM_API_VERSION) {
625
        ret = -EINVAL;
626
        fprintf(stderr, "kvm version not supported\n");
627
        goto err;
628
    }
629

    
630
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
631
    if (s->vmfd < 0) {
632
#ifdef TARGET_S390X
633
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
634
                        "your host kernel command line\n");
635
#endif
636
        goto err;
637
    }
638

    
639
    /* initially, KVM allocated its own memory and we had to jump through
640
     * hooks to make phys_ram_base point to this.  Modern versions of KVM
641
     * just use a user allocated buffer so we can use regular pages
642
     * unmodified.  Make sure we have a sufficiently modern version of KVM.
643
     */
644
    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
645
        ret = -EINVAL;
646
        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
647
                upgrade_note);
648
        goto err;
649
    }
650

    
651
    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
652
     * destroyed properly.  Since we rely on this capability, refuse to work
653
     * with any kernel without this capability. */
654
    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
655
        ret = -EINVAL;
656

    
657
        fprintf(stderr,
658
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
659
                upgrade_note);
660
        goto err;
661
    }
662

    
663
    s->coalesced_mmio = 0;
664
#ifdef KVM_CAP_COALESCED_MMIO
665
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
666
    s->coalesced_mmio_ring = NULL;
667
#endif
668

    
669
    s->broken_set_mem_region = 1;
670
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
671
    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
672
    if (ret > 0) {
673
        s->broken_set_mem_region = 0;
674
    }
675
#endif
676

    
677
    s->vcpu_events = 0;
678
#ifdef KVM_CAP_VCPU_EVENTS
679
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
680
#endif
681

    
682
    s->robust_singlestep = 0;
683
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
684
    s->robust_singlestep =
685
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
686
#endif
687

    
688
    s->debugregs = 0;
689
#ifdef KVM_CAP_DEBUGREGS
690
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
691
#endif
692

    
693
    s->xsave = 0;
694
#ifdef KVM_CAP_XSAVE
695
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
696
#endif
697

    
698
    s->xcrs = 0;
699
#ifdef KVM_CAP_XCRS
700
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
701
#endif
702

    
703
    ret = kvm_arch_init(s, smp_cpus);
704
    if (ret < 0)
705
        goto err;
706

    
707
    kvm_state = s;
708
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
709

    
710
    return 0;
711

    
712
err:
713
    if (s) {
714
        if (s->vmfd != -1)
715
            close(s->vmfd);
716
        if (s->fd != -1)
717
            close(s->fd);
718
    }
719
    qemu_free(s);
720

    
721
    return ret;
722
}
723

    
724
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
725
                         uint32_t count)
726
{
727
    int i;
728
    uint8_t *ptr = data;
729

    
730
    for (i = 0; i < count; i++) {
731
        if (direction == KVM_EXIT_IO_IN) {
732
            switch (size) {
733
            case 1:
734
                stb_p(ptr, cpu_inb(port));
735
                break;
736
            case 2:
737
                stw_p(ptr, cpu_inw(port));
738
                break;
739
            case 4:
740
                stl_p(ptr, cpu_inl(port));
741
                break;
742
            }
743
        } else {
744
            switch (size) {
745
            case 1:
746
                cpu_outb(port, ldub_p(ptr));
747
                break;
748
            case 2:
749
                cpu_outw(port, lduw_p(ptr));
750
                break;
751
            case 4:
752
                cpu_outl(port, ldl_p(ptr));
753
                break;
754
            }
755
        }
756

    
757
        ptr += size;
758
    }
759

    
760
    return 1;
761
}
762

    
763
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
/*
 * Report a KVM_EXIT_INTERNAL_ERROR exit: print the suberror and extra data
 * (when the capability is present), dump the CPU state and stop the VM,
 * unless this is an emulation failure that
 * kvm_arch_stop_on_emulation_error() says not to stop on.
 */
static void kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int idx = 0;

        fprintf(stderr, "KVM internal error. Suberror: %d\n",
                run->internal.suberror);

        while (idx < run->internal.ndata) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
            ++idx;
        }
    }

    cpu_dump_state(env, stderr, fprintf, 0);

    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            return;
        }
    }

    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    vm_stop(0);
}
#endif
790

    
791
/*
 * Replay every pending entry of the coalesced MMIO ring as a physical
 * memory write, advancing ring->first after each one.  Does nothing when
 * no ring has been set up or coalesced MMIO is not compiled in.
 */
void kvm_flush_coalesced_mmio_buffer(void)
{
#ifdef KVM_CAP_COALESCED_MMIO
    struct kvm_coalesced_mmio_ring *ring = kvm_state->coalesced_mmio_ring;

    if (!ring) {
        return;
    }

    while (ring->first != ring->last) {
        struct kvm_coalesced_mmio *entry = &ring->coalesced_mmio[ring->first];

        cpu_physical_memory_write(entry->phys_addr, entry->data, entry->len);
        /* write barrier before the consumer index is advanced */
        smp_wmb();
        ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
    }
#endif
}
809

    
810
static void do_kvm_cpu_synchronize_state(void *_env)
811
{
812
    CPUState *env = _env;
813

    
814
    if (!env->kvm_vcpu_dirty) {
815
        kvm_arch_get_registers(env);
816
        env->kvm_vcpu_dirty = 1;
817
    }
818
}
819

    
820
/* Schedule do_kvm_cpu_synchronize_state() on @env via run_on_cpu(), unless
 * its register copy is already marked dirty. */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (env->kvm_vcpu_dirty) {
        return;
    }

    run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}
825

    
826
/* Write the registers to KVM at KVM_PUT_RESET_STATE level and clear the
 * dirty marker. */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    const int level = KVM_PUT_RESET_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
831

    
832
/* Write the registers to KVM at KVM_PUT_FULL_STATE level and clear the
 * dirty marker. */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    const int level = KVM_PUT_FULL_STATE;

    kvm_arch_put_registers(env, level);
    env->kvm_vcpu_dirty = 0;
}
837

    
838
/*
 * Run the vcpu with KVM_RUN until an exit needs attention outside this
 * loop.  Each handler sets ret: a value > 0 re-enters the guest, anything
 * else leaves the loop and is returned.  A KVM_RUN failure other than
 * -EINTR/-EAGAIN aborts.  A debug exit accepted by kvm_arch_debug() returns
 * 0 directly with exception_index set to EXCP_DEBUG.
 */
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    for (;;) {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        /* push locally modified registers before entering the guest */
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        /* the iothread lock is released for the duration of KVM_RUN */
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            DPRINTF("kvm_exit_unknown\n");
            break;
        case KVM_EXIT_FAIL_ENTRY:
            DPRINTF("kvm_exit_fail_entry\n");
            break;
        case KVM_EXIT_EXCEPTION:
            DPRINTF("kvm_exit_exception\n");
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }

        /* only a positive handler result re-enters the guest */
        if (ret <= 0) {
            break;
        }
    }

    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}
951

    
952
/*
 * Issue ioctl @type on s->fd with one pointer-sized variadic argument.
 * Returns the ioctl result, with -1 replaced by -errno.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list ap;
    void *arg;
    int ret;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);

    return ret == -1 ? -errno : ret;
}
968

    
969
/*
 * Issue ioctl @type on s->vmfd with one pointer-sized variadic argument.
 * Returns the ioctl result, with -1 replaced by -errno.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list ap;
    void *arg;
    int ret;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);

    return ret == -1 ? -errno : ret;
}
985

    
986
/*
 * Issue ioctl @type on env->kvm_fd with one pointer-sized variadic argument.
 * Returns the ioctl result, with -1 replaced by -errno.
 */
int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    va_list ap;
    void *arg;
    int ret;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);

    return ret == -1 ? -errno : ret;
}
1002

    
1003
/* Result of the KVM_CAP_SYNC_MMU probe, or 0 when the capability is not
 * known at build time. */
int kvm_has_sync_mmu(void)
{
    int supported = 0;

#ifdef KVM_CAP_SYNC_MMU
    supported = kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
#endif

    return supported;
}
1013

    
1014
int kvm_has_vcpu_events(void)
1015
{
1016
    return kvm_state->vcpu_events;
1017
}
1018

    
1019
int kvm_has_robust_singlestep(void)
1020
{
1021
    return kvm_state->robust_singlestep;
1022
}
1023

    
1024
int kvm_has_debugregs(void)
1025
{
1026
    return kvm_state->debugregs;
1027
}
1028

    
1029
int kvm_has_xsave(void)
1030
{
1031
    return kvm_state->xsave;
1032
}
1033

    
1034
int kvm_has_xcrs(void)
1035
{
1036
    return kvm_state->xcrs;
1037
}
1038

    
1039
/*
 * Prepare a guest memory range for use when kvm_has_sync_mmu() reports 0.
 *
 * In that case the range [start, start + size) is marked MADV_DONTFORK.
 * The process exits with status 1 if madvise() fails, or if MADV_DONTFORK
 * is not available at build time.  Nothing is done when kvm_has_sync_mmu()
 * returns non-zero.
 */
void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
#ifdef MADV_DONTFORK
        int ret = madvise(start, size, MADV_DONTFORK);

        if (ret) {
            /* Tag the message with the real name of the failing call. */
            perror("madvise");
            exit(1);
        }
#else
        fprintf(stderr,
                "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
        exit(1);
#endif
    }
}
1056

    
1057
#ifdef KVM_CAP_SET_GUEST_DEBUG
1058
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
1059
                                                 target_ulong pc)
1060
{
1061
    struct kvm_sw_breakpoint *bp;
1062

    
1063
    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
1064
        if (bp->pc == pc)
1065
            return bp;
1066
    }
1067
    return NULL;
1068
}
1069

    
1070
/*
 * Return 1 when the software breakpoint list of env->kvm_state holds at
 * least one entry, 0 when it is empty.
 */
int kvm_sw_breakpoints_active(CPUState *env)
{
    if (QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints)) {
        return 0;
    }
    return 1;
}
1074

    
1075
/*
 * Argument bundle handed to kvm_invoke_set_guest_debug() through its
 * void pointer parameter.
 */
struct kvm_set_guest_debug_data {
1076
    struct kvm_guest_debug dbg; /* payload passed to KVM_SET_GUEST_DEBUG */
1077
    CPUState *env;              /* CPU whose vcpu ioctl is issued */
1078
    int err;                    /* receives the kvm_vcpu_ioctl() result */
1079
};
1080

    
1081
static void kvm_invoke_set_guest_debug(void *data)
1082
{
1083
    struct kvm_set_guest_debug_data *dbg_data = data;
1084
    CPUState *env = dbg_data->env;
1085

    
1086
    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
1087
}
1088

    
1089
/*
 * Build the guest-debug control word for @env and apply it.
 *
 * The control word starts as @reinject_trap; KVM_GUESTDBG_ENABLE and
 * KVM_GUESTDBG_SINGLESTEP are added when env->singlestep_enabled is set.
 * kvm_arch_update_guest_debug() then gets a chance to adjust the request
 * before it is submitted through run_on_cpu().
 *
 * Returns the error value stored by kvm_invoke_set_guest_debug().
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data req;

    req.env = env;
    req.dbg.control = reinject_trap;
    if (env->singlestep_enabled) {
        req.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &req.dbg);

    run_on_cpu(env, kvm_invoke_set_guest_debug, &req);

    return req.err;
}
1104

    
1105
/*
 * Insert a breakpoint and refresh the guest-debug state of every CPU.
 *
 * For GDB_BREAKPOINT_SW an existing entry at @addr only has its use_count
 * incremented and the function returns 0 without touching the CPUs.
 * Otherwise a new entry is allocated, installed through
 * kvm_arch_insert_sw_breakpoint() and put at the head of the
 * kvm_sw_breakpoints list.  Any other @type is forwarded to
 * kvm_arch_insert_hw_breakpoint().
 *
 * Returns 0 on success, -ENOMEM if the allocation yields NULL, or the
 * first non-zero error reported by the arch hook or by
 * kvm_update_guest_debug().
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            /* Release with the allocator that produced the buffer. */
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
1146

    
1147
/*
 * Remove a breakpoint and refresh the guest-debug state of every CPU.
 *
 * For GDB_BREAKPOINT_SW the entry at @addr is looked up; when its
 * use_count is above one it is only decremented and 0 is returned without
 * touching the CPUs.  Otherwise the entry is removed through
 * kvm_arch_remove_sw_breakpoint(), unlinked from the list and freed.  Any
 * other @type is forwarded to kvm_arch_remove_hw_breakpoint().
 *
 * Returns 0 on success, -ENOENT when no software breakpoint exists at
 * @addr, or the first non-zero error reported by the arch hook or by
 * kvm_update_guest_debug().
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    CPUState *cpu;
    int rc;

    if (type != GDB_BREAKPOINT_SW) {
        rc = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (rc) {
            return rc;
        }
    } else {
        struct kvm_sw_breakpoint *sw;

        sw = kvm_find_sw_breakpoint(current_env, addr);
        if (sw == NULL) {
            return -ENOENT;
        }

        /* Still referenced by another user: just drop one reference. */
        if (sw->use_count > 1) {
            sw->use_count--;
            return 0;
        }

        rc = kvm_arch_remove_sw_breakpoint(current_env, sw);
        if (rc) {
            return rc;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, sw, entry);
        qemu_free(sw);
    }

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        rc = kvm_update_guest_debug(cpu, 0);
        if (rc) {
            return rc;
        }
    }

    return 0;
}
1183

    
1184
/*
 * Remove every software and hardware breakpoint and refresh the
 * guest-debug state of every CPU.
 *
 * Each software breakpoint is first removed through @current_env; if that
 * fails, every CPU is tried in turn until one succeeds.  The entry is then
 * unlinked from the list and freed regardless of the outcome, so the list
 * is empty when this function returns.
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
        /*
         * Drop the bookkeeping entry as well; otherwise it leaks and the
         * list keeps reporting breakpoints after "remove all".
         */
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}
1204

    
1205
#else /* !KVM_CAP_SET_GUEST_DEBUG */
1206

    
1207
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails
 * with -EINVAL.
 */
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    (void)env;
    (void)reinject_trap;

    return -EINVAL;
}
1211

    
1212
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails
 * with -EINVAL.
 */
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1217

    
1218
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails
 * with -EINVAL.
 */
int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    (void)current_env;
    (void)addr;
    (void)len;
    (void)type;

    return -EINVAL;
}
1223

    
1224
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: there is nothing
 * to remove, so this does nothing.
 */
void kvm_remove_all_breakpoints(CPUState *current_env)
{
    (void)current_env;
}
1227
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
1228

    
1229
/*
 * Set the signal mask for @env through KVM_SET_SIGNAL_MASK.
 *
 * A NULL @sigset is passed straight through as a NULL ioctl argument.
 * Otherwise a temporary struct kvm_signal_mask is allocated, filled with a
 * copy of @sigset, submitted, and released again.
 *
 * Returns the kvm_vcpu_ioctl() result.
 */
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    /*
     * NOTE(review): 8 is presumably the kernel-side signal set size in
     * bytes rather than sizeof(sigset_t) from libc -- confirm against the
     * KVM API documentation.
     */
    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    /* Release with the allocator that produced the buffer. */
    qemu_free(sigmask);

    return r;
}
1246

    
1247
/*
 * Assign or deassign an ioeventfd for a 2-byte port I/O access.
 *
 * The request matches value @val at port @addr (KVM_IOEVENTFD_FLAG_DATAMATCH
 * and KVM_IOEVENTFD_FLAG_PIO) and is bound to @fd.  When @assign is false,
 * KVM_IOEVENTFD_FLAG_DEASSIGN is added to the flags.
 *
 * Returns 0 on success, the negative kvm_vm_ioctl() result on failure, or
 * -ENOSYS when kvm_enabled() is false or KVM_IOEVENTFD is not defined at
 * build time.
 */
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd req = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int rc;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        req.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    rc = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &req);

    return (rc < 0) ? rc : 0;
#else
    return -ENOSYS;
#endif
}