/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
#ifdef KVM_CAP_COALESCED_MMIO
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
};

static KVMState *kvm_state;

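/* Return the first free slot, skipping the range this KVM version reserves
 * for internal use (slots 8..11); aborts if every usable slot is taken. */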
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        /* KVM private memory slots */
        if (i >= 8 && i < 12)
            continue;
        if (s->slots[i].memory_size == 0)
            return &s->slots[i];
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}

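/* Push one slot's layout to the kernel via KVM_SET_USER_MEMORY_REGION.
 * While migration logging is active, dirty page logging is forced on for
 * every slot, independent of the slot's own flags. */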
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_get_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}

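/* Create the vcpu and mmap the kvm_run area shared with the kernel.  The
 * mapping size comes from KVM_GET_VCPU_MMAP_SIZE and, when coalesced MMIO
 * is available, also covers the ring page located behind kvm_run. */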
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

#ifdef KVM_CAP_COALESCED_MMIO
    if (s->coalesced_mmio && !s->coalesced_mmio_ring)
        s->coalesced_mmio_ring = (void *) env->kvm_run +
                s->coalesced_mmio * PAGE_SIZE;
#endif

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

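/* Enable/disable KVM_MEM_LOG_DIRTY_PAGES on the slot exactly matching
 * [phys_addr, phys_addr + size). */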
int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty(), i.e. every page the kernel reports as
 * dirty is marked dirty in qemu as well.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}

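/* (Un)register a guest-physical zone for MMIO write coalescing.  Writes to
 * a registered zone are queued in a kernel ring buffer and only replayed
 * when kvm_flush_coalesced_mmio_buffer() runs. */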
int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

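/* Query a KVM capability.  Returns the non-negative value reported by the
 * kernel, or 0 if the ioctl fails or the extension is unknown. */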
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

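/* Bring the kernel's slot table in sync with qemu's view of a guest
 * physical range: tear down every overlapping slot, re-register any
 * surviving head as a prefix slot and any surviving tail as a suffix
 * slot, then register the new range itself (unless it is unassigned I/O
 * memory, which KVM need not know about). */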
static void kvm_set_phys_mem(target_phys_addr_t start_addr,
                             ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    if (start_addr & ~TARGET_PAGE_MASK) {
        if (flags >= IO_MEM_UNASSIGNED) {
            if (!kvm_lookup_overlapping_slot(s, start_addr,
                                             start_addr + size)) {
                return;
            }
            fprintf(stderr, "Unaligned split of a KVM memory slot\n");
        } else {
            fprintf(stderr, "Only page-aligned memory slots supported\n");
        }
        abort();
    }

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, even not by
473
         * unregistering the previous ones and then registering the larger
474
         * slot. We have to maintain the existing fragmentation. Sigh.
475
         *
476
         * This workaround assumes that the new slot starts at the same
477
         * address as the first existing one. If not or if some overlapping
478
         * slot comes around later, we will fail (not seen in practice so far)
479
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size)
        return;

    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED)
        return;

    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

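/* Adapters binding the functions above to the generic CPUPhysMemoryClient
 * interface that kvm_init() registers. */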
static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size,
                                  ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
};

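/* One-time initialization: open /dev/kvm, check the API version, create
 * the VM and probe the capabilities cached in KVMState, then hook KVM
 * into qemu's physical memory management. */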
int kvm_init(int smp_cpus)
588
{
589
    static const char upgrade_note[] =
590
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
591
        "(see http://sourceforge.net/projects/kvm).\n";
592
    KVMState *s;
593
    int ret;
594
    int i;
595

    
596
    s = qemu_mallocz(sizeof(KVMState));
597

    
598
#ifdef KVM_CAP_SET_GUEST_DEBUG
599
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
600
#endif
601
    for (i = 0; i < ARRAY_SIZE(s->slots); i++)
602
        s->slots[i].slot = i;
603

    
604
    s->vmfd = -1;
605
    s->fd = qemu_open("/dev/kvm", O_RDWR);
606
    if (s->fd == -1) {
607
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
608
        ret = -errno;
609
        goto err;
610
    }
611

    
612
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
613
    if (ret < KVM_API_VERSION) {
614
        if (ret > 0)
615
            ret = -EINVAL;
616
        fprintf(stderr, "kvm version too old\n");
617
        goto err;
618
    }
619

    
620
    if (ret > KVM_API_VERSION) {
621
        ret = -EINVAL;
622
        fprintf(stderr, "kvm version not supported\n");
623
        goto err;
624
    }
625

    
626
    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
627
    if (s->vmfd < 0) {
628
#ifdef TARGET_S390X
629
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
630
                        "your host kernel command line\n");
631
#endif
632
        goto err;
633
    }
634

    
    /* initially, KVM allocated its own memory and we had to jump through
     * hoops to make phys_ram_base point to this.  Modern versions of KVM
     * just use a user allocated buffer so we can use regular pages
     * unmodified.  Make sure we have a sufficiently modern version of KVM.
     */
    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
                upgrade_note);
        goto err;
    }

    /* There was a nasty bug in < kvm-80 that prevented memory slots from
     * being destroyed properly.  Since we rely on this capability, refuse
     * to work with any kernel without it. */
    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        ret = -EINVAL;

        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        goto err;
    }

    s->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_mmio_ring = NULL;
#endif

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    ret = kvm_arch_init(s, smp_cpus);
    if (ret < 0)
        goto err;

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    return 0;

err:
    if (s) {
        if (s->vmfd != -1)
            close(s->vmfd);
        if (s->fd != -1)
            close(s->fd);
    }
    qemu_free(s);

    return ret;
}

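/* Replay a KVM_EXIT_IO request through qemu's port I/O handlers.  The
 * data buffer lives inside the shared kvm_run mapping; count > 1 occurs
 * for string PIO instructions. */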
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
                         uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }

    return 1;
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static void kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, "KVM internal error. Suberror: %d\n",
                run->internal.suberror);

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    cpu_dump_state(env, stderr, fprintf, 0);
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    vm_stop(0);
}
#endif

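/* Drain the coalesced MMIO ring shared with the kernel, replaying each
 * buffered write through the normal memory path.  Called right after
 * KVM_RUN returns, before the exit reason is handled. */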
void kvm_flush_coalesced_mmio_buffer(void)
776
{
777
#ifdef KVM_CAP_COALESCED_MMIO
778
    KVMState *s = kvm_state;
779
    if (s->coalesced_mmio_ring) {
780
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
781
        while (ring->first != ring->last) {
782
            struct kvm_coalesced_mmio *ent;
783

    
784
            ent = &ring->coalesced_mmio[ring->first];
785

    
786
            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
787
            smp_wmb();
788
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
789
        }
790
    }
791
#endif
792
}
793

    
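/* Lazy register synchronization: registers are fetched from the kernel
 * only on demand, and env->kvm_vcpu_dirty tracks whether qemu's copy must
 * be written back before the next KVM_RUN. */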
static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty)
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}

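/* Per-vcpu execution loop: enter the guest via KVM_RUN with the iothread
 * lock dropped, then dispatch the exit reason; ret > 0 re-enters the
 * guest, ret <= 0 returns to the generic cpu loop. */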
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    do {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            DPRINTF("kvm_exit_unknown\n");
            break;
        case KVM_EXIT_FAIL_ENTRY:
            DPRINTF("kvm_exit_fail_entry\n");
            break;
        case KVM_EXIT_EXCEPTION:
            DPRINTF("kvm_exit_exception\n");
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                gdb_set_stop_cpu(env);
                vm_stop(EXCP_DEBUG);
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}

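/* Thin wrappers around ioctl() for the three KVM file descriptor levels
 * (system, VM, vcpu), converting the -1/errno convention into negative
 * errno return values. */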
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

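/* Typical caller pattern (illustrative sketch, not code from this file):
 * arch code reads vcpu registers through the wrapper and checks the
 * negative-errno result:
 *
 *     struct kvm_regs regs;
 *     int r = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs);
 *     if (r < 0) {
 *         fprintf(stderr, "KVM_GET_REGS failed: %s\n", strerror(-r));
 *     }
 */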
int kvm_has_sync_mmu(void)
{
#ifdef KVM_CAP_SYNC_MMU
    KVMState *s = kvm_state;

    return kvm_check_extension(s, KVM_CAP_SYNC_MMU);
#else
    return 0;
#endif
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

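/* Without a synchronous MMU, KVM cannot tolerate guest RAM mappings
 * changing underneath it (as copy-on-write after fork() would cause), so
 * guest memory is excluded from forks via MADV_DONTFORK; if neither
 * feature is available, running is unsafe and we exit. */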
void kvm_setup_guest_memory(void *start, size_t size)
1016
{
1017
    if (!kvm_has_sync_mmu()) {
1018
#ifdef MADV_DONTFORK
1019
        int ret = madvise(start, size, MADV_DONTFORK);
1020

    
1021
        if (ret) {
1022
            perror("madvice");
1023
            exit(1);
1024
        }
1025
#else
1026
        fprintf(stderr,
1027
                "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1028
        exit(1);
1029
#endif
1030
    }
1031
}
1032

    
#ifdef KVM_CAP_SET_GUEST_DEBUG
static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
{
#ifdef CONFIG_IOTHREAD
    if (env != cpu_single_env) {
        abort();
    }
#endif
    func(data);
}

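/* Software breakpoints are kept in a per-VM list shared by all vcpus;
 * use_count tracks how many gdb breakpoints alias the same pc. */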
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc)
            return bp;
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    on_vcpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

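/* gdbstub entry points: add or drop a software/hardware breakpoint and
 * refresh the guest debug state on every vcpu afterwards. */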
int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp)
            return -ENOMEM;

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                           bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp)
            return -ENOENT;

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err)
            return err;

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0)
                    break;
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu)
        kvm_update_guest_debug(env, 0);
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

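/* Install (or clear, when sigset is NULL) the signal mask the kernel
 * applies to this thread while it executes guest code inside KVM_RUN. */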
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset)
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8; /* the kernel's sigset_t is 8 bytes */
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}

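/* Bind an eventfd to a 2-byte PIO write of 'val' at 'addr' (or unbind it
 * when 'assign' is false).  Matching writes are then completed entirely
 * in the kernel, which signals the fd instead of exiting to userspace. */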
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled())
        return -ENOSYS;
    if (!assign)
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0)
        return r;
    return 0;
#else
    return -ENOSYS;
#endif
}