root / kvm-all.c @ 4556bd8b
/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
#ifdef KVM_CAP_COALESCED_MMIO
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
};

static KVMState *kvm_state;

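/* Editor's note: each KVMSlot mirrors one kvm_userspace_memory_region in the
 * kernel - a contiguous guest-physical range backed by host virtual memory.
 * A slot with memory_size == 0 is unused.  Slots 8-11 are skipped below
 * because this tree reserves them for KVM-internal mappings; treat the exact
 * range as an assumption of this version, not a stable ABI. */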
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        /* KVM private memory slots */
        if (i >= 8 && i < 12)
            continue;
        if (s->slots[i].memory_size == 0)
            return &s->slots[i];
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}

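/* Push one slot to the kernel via KVM_SET_USER_MEMORY_REGION.  Setting
 * slot->memory_size to 0 before calling this deletes the region, which is
 * how the update paths below unregister slots.  While migration logging is
 * globally enabled, KVM_MEM_LOG_DIRTY_PAGES is forced on for every slot. */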
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_get_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}

173
{
174
    KVMState *s = kvm_state;
175
    long mmap_size;
176
    int ret;
177

    
178
    DPRINTF("kvm_init_vcpu\n");
179

    
180
    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
181
    if (ret < 0) {
182
        DPRINTF("kvm_create_vcpu failed\n");
183
        goto err;
184
    }
185

    
186
    env->kvm_fd = ret;
187
    env->kvm_state = s;
188

    
189
    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
190
    if (mmap_size < 0) {
191
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
192
        goto err;
193
    }
194

    
195
    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
196
                        env->kvm_fd, 0);
197
    if (env->kvm_run == MAP_FAILED) {
198
        ret = -errno;
199
        DPRINTF("mmap'ing vcpu state failed\n");
200
        goto err;
201
    }
202

    
203
#ifdef KVM_CAP_COALESCED_MMIO
204
    if (s->coalesced_mmio && !s->coalesced_mmio_ring)
205
        s->coalesced_mmio_ring = (void *) env->kvm_run +
206
                s->coalesced_mmio * PAGE_SIZE;
207
#endif
208

    
209
    ret = kvm_arch_init_vcpu(env);
210
    if (ret == 0) {
211
        qemu_register_reset(kvm_reset_vcpu, env);
212
        kvm_arch_reset_vcpu(env);
213
    }
214
err:
215
    return ret;
216
}
217

    
/*
 * dirty pages logging control
 */
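/* flags/mask contract (as used by the callers below): bits in 'mask' are
 * cleared from the slot's flags, then bits in 'flags' are set.  So
 * kvm_log_start() passes flags = mask = KVM_MEM_LOG_DIRTY_PAGES, and
 * kvm_log_stop() passes flags = 0 with the same mask. */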
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, there is no need to issue an ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
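/* Bit j of word i in the kernel bitmap stands for page i * HOST_LONG_BITS + j
 * relative to the start of the slot; e.g. with 64-bit longs and 4 KiB target
 * pages, bit 3 of word 2 marks the page at byte offset (2 * 64 + 3) * 4096. */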
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

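/* Example (editor's note): for a 1 MiB slot with 4 KiB target pages on a
 * 64-bit host, the bitmap below needs ALIGN(256, 64) / 8 = 32 bytes:
 * 256 pages, rounded up to whole 64-bit words, 8 bits per byte. */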
/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using cpu_physical_memory_set_dirty(),
 * which sets all dirty-flag bits for each page the kernel reports as dirty.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}

int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

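/* Reconcile a guest-physical range with the kernel's slot table.  Roughly:
 *
 *   old slot:    |--------------------------|
 *   new range:          |==========|
 *   result:      |prefix|==========|suffix--|
 *
 * Every overlapping slot is unregistered; the still-valid head and tail of
 * the old slot are re-registered as fresh slots, and finally the new range
 * itself is registered - unless it is unassigned I/O memory, which KVM does
 * not need to know about. */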
static void kvm_set_phys_mem(target_phys_addr_t start_addr,
                             ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    if (start_addr & ~TARGET_PAGE_MASK) {
        if (flags >= IO_MEM_UNASSIGNED) {
            if (!kvm_lookup_overlapping_slot(s, start_addr,
                                             start_addr + size)) {
                return;
            }
            fprintf(stderr, "Unaligned split of a KVM memory slot\n");
        } else {
            fprintf(stderr, "Only page-aligned memory slots supported\n");
        }
        abort();
    }

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, not even
         * by unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size)
        return;

    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED)
        return;

    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size,
                                  ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
};

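/* Open /dev/kvm, create the VM, and probe capabilities.  On success this
 * also registers kvm_cpu_phys_memory_client above, so later changes to the
 * guest-physical memory layout are funneled into kvm_set_phys_mem(). */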
int kvm_init(int smp_cpus)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++)
        s->slots[i].slot = i;

    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0)
            ret = -EINVAL;
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        goto err;
    }

    /* initially, KVM allocated its own memory and we had to jump through
     * hoops to make phys_ram_base point to this.  Modern versions of KVM
     * just use a user allocated buffer so we can use regular pages
     * unmodified.  Make sure we have a sufficiently modern version of KVM.
     */
    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
                upgrade_note);
        goto err;
    }

    /* There was a nasty bug in < kvm-80 that prevented memory slots from
     * being destroyed properly.  Since we rely on this capability, refuse
     * to work with any kernel without it. */
    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        ret = -EINVAL;

        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        goto err;
    }

    s->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_mmio_ring = NULL;
#endif

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    ret = kvm_arch_init(s, smp_cpus);
    if (ret < 0)
        goto err;

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    return 0;

err:
    if (s) {
        if (s->vmfd != -1)
            close(s->vmfd);
        if (s->fd != -1)
            close(s->fd);
    }
    qemu_free(s);

    return ret;
}

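/* Dispatch a PIO exit to the emulated I/O ports.  'data' points into the
 * shared kvm_run page (run + run->io.data_offset); for string instructions
 * (ins/outs) 'count' is greater than one and the buffer holds consecutive
 * items of 'size' bytes each. */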
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
                         uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }

    return 1;
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static void kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, "KVM internal error. Suberror: %d\n",
                run->internal.suberror);

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    cpu_dump_state(env, stderr, fprintf, 0);
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env))
            return;
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    vm_stop(0);
}
#endif

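/* Drain the coalesced-MMIO ring shared with the kernel.  The kernel is the
 * producer and appends entries at 'last'; we consume at 'first'.  The write
 * barrier makes sure each entry has been fully read and replayed before the
 * advanced 'first' index becomes visible to the producer, so the slot is
 * not recycled under us. */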
void kvm_flush_coalesced_mmio_buffer(void)
{
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
#endif
}

static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty)
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}

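/* Outer vcpu loop: kvm_vcpu_dirty tracks where the authoritative register
 * state lives.  When set, QEMU's copy is current and must be written back
 * with kvm_arch_put_registers() before the next KVM_RUN; when clear, the
 * kernel copy is current.  The loop keeps re-entering KVM_RUN for as long
 * as an exit handler returns a positive value. */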
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    do {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            DPRINTF("kvm_exit_unknown\n");
            break;
        case KVM_EXIT_FAIL_ENTRY:
            DPRINTF("kvm_exit_fail_entry\n");
            break;
        case KVM_EXIT_EXCEPTION:
            DPRINTF("kvm_exit_exception\n");
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                gdb_set_stop_cpu(env);
                vm_stop(EXCP_DEBUG);
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}

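/* Thin vararg wrappers around ioctl() for the three KVM file descriptors:
 * the system fd (/dev/kvm, e.g. KVM_CHECK_EXTENSION), the per-VM fd
 * (e.g. KVM_SET_USER_MEMORY_REGION) and the per-vcpu fd (e.g. KVM_RUN).
 * Failures are normalized to a negative errno value. */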
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_has_sync_mmu(void)
{
#ifdef KVM_CAP_SYNC_MMU
    KVMState *s = kvm_state;

    return kvm_check_extension(s, KVM_CAP_SYNC_MMU);
#else
    return 0;
#endif
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

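/* Without an in-kernel MMU notifier (KVM_CAP_SYNC_MMU), guest pages must
 * keep their host mapping for the VM's lifetime; a fork() followed by
 * copy-on-write could otherwise leave KVM pointing at stale pages, so guest
 * RAM is simply not mapped into children via MADV_DONTFORK. */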
void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
#ifdef MADV_DONTFORK
        int ret = madvise(start, size, MADV_DONTFORK);

        if (ret) {
            perror("madvise");
            exit(1);
        }
#else
        fprintf(stderr,
                "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
        exit(1);
#endif
    }
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
{
#ifdef CONFIG_IOTHREAD
    if (env != cpu_single_env) {
        abort();
    }
#endif
    func(data);
}

struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc)
            return bp;
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    on_vcpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

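/* Software breakpoints are kept in a per-VM list and reference counted via
 * use_count, so inserting the same address twice is cheap.  After any list
 * change, every vcpu's debug state is refreshed through
 * kvm_update_guest_debug() so the in-kernel settings stay in sync. */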
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp)
            return -ENOMEM;

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                           bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp)
            return -ENOENT;

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err)
            return err;

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0)
                    break;
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu)
        kvm_update_guest_debug(env, 0);
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

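/* Editor's note: sigmask->len below is the size of the sigset as the kernel
 * sees it (8 bytes), not glibc's larger sizeof(sigset_t); the kernel rejects
 * other lengths, hence the hard-coded 8. */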
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset)
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}

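/* Bind an eventfd to a 2-byte PIO write: once registered, a guest write of
 * exactly 'val' to port 'addr' signals 'fd' in the kernel without a
 * userspace exit (typical use: virtio queue notification, e.g.
 * kvm_set_ioeventfd_pio_word(fd, addr, queue_index, true)).  Pass
 * assign = false to tear the binding down again. */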
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled())
        return -ENOSYS;
    if (!assign)
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0)
        return r;
    return 0;
#else
    return -ENOSYS;
#endif
}