/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
#ifdef KVM_CAP_COALESCED_MMIO
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
    int xsave, xcrs;
};

static KVMState *kvm_state;

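/* Return the first unused memory slot, skipping the four slots (8-11)
 * that are reserved as KVM private memory slots; aborts if every slot
 * is already in use. */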
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        /* KVM private memory slots */
        if (i >= 8 && i < 12)
            continue;
        if (s->slots[i].memory_size == 0)
            return &s->slots[i];
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}

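/* Propagate one slot to the kernel via KVM_SET_USER_MEMORY_REGION,
 * forcing KVM_MEM_LOG_DIRTY_PAGES on while migration logging is active. */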
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_get_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}

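/* Create the kernel vcpu for @env, mmap the shared kvm_run structure,
 * locate the coalesced-MMIO ring behind it if present, and run the
 * per-arch init and reset hooks. */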
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

#ifdef KVM_CAP_COALESCED_MMIO
    if (s->coalesced_mmio && !s->coalesced_mmio_ring)
        s->coalesced_mmio_ring = (void *) env->kvm_run +
                s->coalesced_mmio * PAGE_SIZE;
#endif

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */
static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, int flags, int mask)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
    int old_flags;

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | flags;
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size,
                                      0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;   /* position of the lowest set bit */
                c &= ~(1ul << j);  /* clear it and mark the page dirty */
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

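/* Round x up to the next multiple of y; y must be a power of two
 * (e.g. ALIGN(10, 8) == 16). */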
#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty().  This means all bits are set to dirty.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), HOST_LONG_BITS) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}

int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
#ifdef KVM_CAP_COALESCED_MMIO
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
#endif

    return ret;
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

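/* Bring the KVM slot layout in sync with a qemu physical memory change:
 * unaligned or unassigned regions are rejected or skipped, overlapping
 * slots are unregistered, surviving prefix/suffix fragments are
 * re-registered, and finally the new region itself is registered. */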
static void kvm_set_phys_mem(target_phys_addr_t start_addr,
                             ram_addr_t size,
                             ram_addr_t phys_offset)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    if (start_addr & ~TARGET_PAGE_MASK) {
        if (flags >= IO_MEM_UNASSIGNED) {
            if (!kvm_lookup_overlapping_slot(s, start_addr,
                                             start_addr + size)) {
                return;
            }
            fprintf(stderr, "Unaligned split of a KVM memory slot\n");
        } else {
            fprintf(stderr, "Only page-aligned memory slots supported\n");
        }
        abort();
    }

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - nothing to be done. */
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, not even
         * by unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = 0;

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size)
        return;

    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED)
        return;

    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = 0;

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size,
                                  ram_addr_t phys_offset)
{
    kvm_set_phys_mem(start_addr, size, phys_offset);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
};

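/* One-time initialization: open /dev/kvm, check the API version, create
 * the VM, probe the capabilities this file depends on, and register the
 * physical memory client. */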
int kvm_init(int smp_cpus)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++)
        s->slots[i].slot = i;

    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0)
            ret = -EINVAL;
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
        ret = s->vmfd;
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        goto err;
    }

    /* initially, KVM allocated its own memory and we had to jump through
     * hoops to make phys_ram_base point to this.  Modern versions of KVM
     * just use a user allocated buffer so we can use regular pages
     * unmodified.  Make sure we have a sufficiently modern version of KVM.
     */
    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
                upgrade_note);
        goto err;
    }

    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
     * destroyed properly.  Since we rely on this capability, refuse to work
     * with any kernel without this capability. */
    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        ret = -EINVAL;
        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        goto err;
    }

    s->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_mmio_ring = NULL;
#endif

    s->broken_set_mem_region = 1;
#ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }
#endif

    s->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep = 0;
#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
#endif

    s->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->xsave = 0;
#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

    s->xcrs = 0;
#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

    ret = kvm_arch_init(s, smp_cpus);
    if (ret < 0)
        goto err;

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    return 0;

err:
    if (s) {
        if (s->vmfd != -1)
            close(s->vmfd);
        if (s->fd != -1)
            close(s->fd);
    }
    qemu_free(s);

    return ret;
}

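/* Emulate a (possibly string) PIO access on behalf of the guest: perform
 * @count port reads or writes of @size bytes each against the buffer
 * shared with the kernel. */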
static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
                         uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }

    return 1;
}

#ifdef KVM_CAP_INTERNAL_ERROR_DATA
static void kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, "KVM internal error. Suberror: %d\n",
                run->internal.suberror);

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    cpu_dump_state(env, stderr, fprintf, 0);
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env))
            return;
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    vm_stop(0);
}
#endif

787

    
788
void kvm_flush_coalesced_mmio_buffer(void)
789
{
790
#ifdef KVM_CAP_COALESCED_MMIO
791
    KVMState *s = kvm_state;
792
    if (s->coalesced_mmio_ring) {
793
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
794
        while (ring->first != ring->last) {
795
            struct kvm_coalesced_mmio *ent;
796

    
797
            ent = &ring->coalesced_mmio[ring->first];
798

    
799
            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
800
            smp_wmb();
801
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
802
        }
803
    }
804
#endif
805
}
806

    
807
static void do_kvm_cpu_synchronize_state(void *_env)
808
{
809
    CPUState *env = _env;
810

    
811
    if (!env->kvm_vcpu_dirty) {
812
        kvm_arch_get_registers(env);
813
        env->kvm_vcpu_dirty = 1;
814
    }
815
}
816

    
817
void kvm_cpu_synchronize_state(CPUState *env)
818
{
819
    if (!env->kvm_vcpu_dirty)
820
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
821
}
822

    
823
void kvm_cpu_synchronize_post_reset(CPUState *env)
824
{
825
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
826
    env->kvm_vcpu_dirty = 0;
827
}
828

    
829
void kvm_cpu_synchronize_post_init(CPUState *env)
830
{
831
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
832
    env->kvm_vcpu_dirty = 0;
833
}
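/* Per-vcpu execution loop: write back dirty register state, enter the
 * guest via KVM_RUN (with the iothread lock dropped), then dispatch on
 * run->exit_reason; a positive handler result re-enters the guest. */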
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret;

    DPRINTF("kvm_cpu_exec()\n");

    do {
#ifndef CONFIG_IOTHREAD
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            ret = 0;
            break;
        }
#endif

        if (kvm_arch_process_irqchip_events(env)) {
            ret = 0;
            break;
        }

        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        if (ret == -EINTR || ret == -EAGAIN) {
            cpu_exit(env);
            DPRINTF("io window exit\n");
            ret = 0;
            break;
        }

        if (ret < 0) {
            DPRINTF("kvm run failed %s\n", strerror(-ret));
            abort();
        }

        kvm_flush_coalesced_mmio_buffer();

        ret = 0; /* exit loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            ret = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 1;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = 1;
            break;
        case KVM_EXIT_UNKNOWN:
            DPRINTF("kvm_exit_unknown\n");
            break;
        case KVM_EXIT_FAIL_ENTRY:
            DPRINTF("kvm_exit_fail_entry\n");
            break;
        case KVM_EXIT_EXCEPTION:
            DPRINTF("kvm_exit_exception\n");
            break;
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
        case KVM_EXIT_INTERNAL_ERROR:
            kvm_handle_internal_error(env, run);
            break;
#endif
        case KVM_EXIT_DEBUG:
            DPRINTF("kvm_exit_debug\n");
#ifdef KVM_CAP_SET_GUEST_DEBUG
            if (kvm_arch_debug(&run->debug.arch)) {
                gdb_set_stop_cpu(env);
                vm_stop(EXCP_DEBUG);
                env->exception_index = EXCP_DEBUG;
                return 0;
            }
            /* re-enter, this exception was guest-internal */
            ret = 1;
#endif /* KVM_CAP_SET_GUEST_DEBUG */
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret > 0);

    if (env->exit_request) {
        env->exit_request = 0;
        env->exception_index = EXCP_INTERRUPT;
    }

    return ret;
}

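/* Thin variadic wrappers around ioctl() on the /dev/kvm, VM and vcpu file
 * descriptors, respectively. Each takes at most one pointer argument and
 * maps the -1/errno failure convention onto a negative errno return,
 * e.g. kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem). */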
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1)
        ret = -errno;

    return ret;
}

int kvm_has_sync_mmu(void)
{
#ifdef KVM_CAP_SYNC_MMU
    KVMState *s = kvm_state;

    return kvm_check_extension(s, KVM_CAP_SYNC_MMU);
#else
    return 0;
#endif
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}

void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
#ifdef MADV_DONTFORK
        int ret = madvise(start, size, MADV_DONTFORK);

        if (ret) {
            perror("madvise");
            exit(1);
        }
#else
        fprintf(stderr,
                "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
        exit(1);
#endif
    }
}

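/* Guest debugging support (gdbstub): software breakpoints are kept in a
 * per-VM list so they can be (re)inserted into and removed from every
 * vcpu via KVM_SET_GUEST_DEBUG. */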
#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc)
            return bp;
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp)
            return -ENOMEM;

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                           bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp)
            return -ENOENT;

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err)
            return err;

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err)
            return err;
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err)
            return err;
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0)
                    break;
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu)
        kvm_update_guest_debug(env, 0);
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset)
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}

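/* Register (or, with assign == false, deregister) an eventfd that the
 * kernel signals when the guest writes the 16-bit value @val to the PIO
 * port @addr; returns -ENOSYS if ioeventfd support is unavailable. */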
1245

    
1246
int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
1247
{
1248
#ifdef KVM_IOEVENTFD
1249
    struct kvm_ioeventfd kick = {
1250
        .datamatch = val,
1251
        .addr = addr,
1252
        .len = 2,
1253
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
1254
        .fd = fd,
1255
    };
1256
    int r;
1257
    if (!kvm_enabled())
1258
        return -ENOSYS;
1259
    if (!assign)
1260
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1261
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1262
    if (r < 0)
1263
        return r;
1264
    return 0;
1265
#else
1266
    return -ENOSYS;
1267
#endif
1268
}