hw/vfio_pci.c @ 8fc94e5a

/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <dirent.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/vfio.h>

#include "config.h"
#include "qemu/event_notifier.h"
#include "exec/address-spaces.h"
#include "sysemu/kvm.h"
#include "exec/memory.h"
#include "pci/msi.h"
#include "pci/msix.h"
#include "pci/pci.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "qemu/queue.h"
#include "qemu/range.h"

/* #define DEBUG_VFIO */
#ifdef DEBUG_VFIO
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct VFIOBAR {
    off_t fd_offset; /* offset of BAR within device fd */
    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap;
    size_t size;
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the BAR number for debug */
} VFIOBAR;

typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;

struct VFIODevice;

typedef struct VFIOMSIVector {
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    struct VFIODevice *vdev; /* back pointer to device */
    int virq; /* KVM irqchip route for QEMU bypass */
    bool use;
} VFIOMSIVector;

enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI  = 2,
    VFIO_INT_MSIX = 3,
};

struct VFIOGroup;

typedef struct VFIOContainer {
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            MemoryListener listener; /* Used by type1 iommu */
        };
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;

/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar;
    uint8_t pba_bar;
    uint16_t entries;
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;

typedef struct VFIODevice {
    PCIDevice pdev;
    int fd;
    VFIOINTx intx;
    unsigned int config_size;
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    int msi_cap_size;
    VFIOMSIVector *msi_vectors;
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    PCIHostDeviceAddress host;
    QLIST_ENTRY(VFIODevice) next;
    struct VFIOGroup *group;
    bool reset_works;
} VFIODevice;

typedef struct VFIOGroup {
    int fd;
    int groupid;
    VFIOContainer *container;
    QLIST_HEAD(, VFIODevice) device_list;
    QLIST_ENTRY(VFIOGroup) next;
    QLIST_ENTRY(VFIOGroup) container_next;
} VFIOGroup;

#define MSIX_CAP_LENGTH 12

static QLIST_HEAD(, VFIOContainer)
    container_list = QLIST_HEAD_INITIALIZER(container_list);

static QLIST_HEAD(, VFIOGroup)
    group_list = QLIST_HEAD_INITIALIZER(group_list);

static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);

/*
 * Common VFIO interrupt disable
 */
static void vfio_disable_irqindex(VFIODevice *vdev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };
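    /*
     * Editorial note: DATA_NONE + ACTION_TRIGGER with a count of 0 asks the
     * kernel to tear down every trigger for this index, per the
     * VFIO_DEVICE_SET_IRQS description in <linux/vfio.h>.
     */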
    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

/*
 * INTx
 */
static void vfio_unmask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
static void vfio_mask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
#endif

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt arrives to disable mmaps (subsequent
 * transitions to the same state are effectively no overhead).  If the
 * interrupt has been serviced and the time gap is long enough, we
 * re-enable mmaps for performance.  This works well for things like
 * graphics cards, which may not use their interrupt at all and are
 * penalized to an unusable level by read/write BAR traps.  Other devices,
 * like NICs, have more regular interrupts and see much better latency by
 * staying in non-mmap mode.  We therefore set the default mmap_timeout
 * such that a ping is just enough to keep the mmap disabled.  Users can
 * experiment with other options with the x-intx-mmap-timeout-ms parameter
 * (a value of zero disables the timer).
 */
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (vdev->intx.pending) {
        qemu_mod_timer(vdev->intx.mmap_timer,
                       qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}

static void vfio_intx_interrupt(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        qemu_mod_timer(vdev->intx.mmap_timer,
                       qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
    }
}

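/*
 * Editorial note: an EOI here de-asserts the virtual IRQ and unmasks the
 * physical line; a pending INTx is considered serviced once the guest has
 * touched the device (see the BAR access paths below, which call this on
 * every read/write).
 */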
static void vfio_eoi(VFIODevice *vdev)
{
    if (!vdev->intx.pending) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
    vfio_unmask_intx(vdev);
}

static void vfio_enable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
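    /*
     * Editorial note: a resampling irqfd is level-triggered.  KVM injects
     * the GSI when the trigger eventfd fires and signals the resamplefd on
     * guest EOI, at which point VFIO unmasks the still-asserted physical
     * interrupt.
     */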
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (!kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi\n");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_intx(vdev);

    vdev->intx.kvm_accel = true;

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_intx(vdev);
#endif
}

static void vfio_disable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
    }

    /* We only need to close the eventfd for VFIO to clean up the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_intx(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);
#endif
}

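/*
 * Editorial note: re-evaluate the INTx route (e.g. after a chipset routing
 * change) and, if it moved, rebuild the KVM bypass for the new GSI.
 */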
static void vfio_update_irq(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->intx.route.irq, route.irq);

    vfio_disable_intx_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_enable_intx_kvm(vdev);

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_eoi(vdev);
}

static int vfio_enable_intx(VFIODevice *vdev)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */

#ifdef CONFIG_KVM
    /*
     * This is conditional only to avoid generating error messages on
     * platforms where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() &&
        kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_report("vfio: Error: event_notifier_init failed\n");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx fd: %m\n");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        return -errno;
    }

    vfio_enable_intx_kvm(vdev);

    vdev->interrupt = VFIO_INT_INTx;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    return 0;
}

static void vfio_disable_intx(VFIODevice *vdev)
{
    int fd;

    qemu_del_timer(vdev->intx.mmap_timer);
    vfio_disable_intx_kvm(vdev);
    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIODevice *vdev = vector->vdev;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msix_notify(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msi_notify(&vdev->pdev, nr);
    } else {
        error_report("vfio: MSI interrupt received, but not enabled?\n");
    }
}

static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

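    /*
     * Editorial note: a vector not currently in use gets an fd of -1,
     * which the VFIO_DEVICE_SET_IRQS API takes to mean "allocated, but no
     * trigger assigned".
     */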
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (!vdev->msi_vectors[i].use) {
            fds[i] = -1;
            continue;
        }

        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
    }

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}

static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    vector = &vdev->msi_vectors[nr];
    vector->vdev = vdev;
    vector->use = true;

    msix_vector_use(pdev, nr);

    if (event_notifier_init(&vector->interrupt, 0)) {
        error_report("vfio: Error: event_notifier_init failed\n");
    }

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
    if (vector->virq < 0 ||
        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                       vector->virq) < 0) {
        if (vector->virq >= 0) {
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        }
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            handler, NULL, vector);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut them down and
     * incrementally increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d\n", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        *pfd = event_notifier_get_fd(&vector->interrupt);

        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d\n", ret);
        }
    }

    return 0;
}

static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    /*
     * XXX What's the right thing to do here?  This turns off the interrupt
     * completely, but do we really just want to switch the interrupt to
     * bouncing through userspace and let msix.c drop it?  Not sure.
     */
    msix_vector_unuse(pdev, nr);

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
    irq_set->start = nr;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = -1;

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    if (vector->virq < 0) {
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            NULL, NULL, NULL);
    } else {
        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
                                          vector->virq);
        kvm_irqchip_release_virq(kvm_state, vector->virq);
        vector->virq = -1;
    }

    event_notifier_cleanup(&vector->interrupt);
    vector->use = false;
}

static void vfio_enable_msix(VFIODevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed\n");
    }

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

static void vfio_enable_msi(VFIODevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
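    /*
     * Editorial note: the kernel may grant fewer vectors than requested;
     * on a short count (ret > 0 below) everything is torn down and we
     * retry from here with the number actually granted.
     */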
retry:
    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));

    for (i = 0; i < vdev->nr_vectors; i++) {
        MSIMessage msg;
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        vector->vdev = vdev;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed\n");
        }

        msg = msi_get_message(&vdev->pdev, i);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
        if (vector->virq < 0 ||
            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                           vector->virq) < 0) {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                vfio_msi_interrupt, NULL, vector);
        }
    }

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m\n");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d\n", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
                                                  vector->virq);
                kvm_irqchip_release_virq(kvm_state, vector->virq);
                vector->virq = -1;
            } else {
                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                    NULL, NULL, NULL);
            }
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        return;
    }

    vdev->interrupt = VFIO_INT_MSI;

    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->nr_vectors);
}

static void vfio_disable_msi_common(VFIODevice *vdev)
{
    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_enable_intx(vdev);
}

static void vfio_disable_msix(VFIODevice *vdev)
{
    msix_unset_vector_notifiers(&vdev->pdev);

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

static void vfio_disable_msi(VFIODevice *vdev)
{
    int i;

    vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        if (!vector->use) {
            continue;
        }

        if (vector->virq >= 0) {
            kvm_irqchip_remove_irqfd_notifier(kvm_state,
                                              &vector->interrupt, vector->virq);
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        } else {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
        }

        event_notifier_cleanup(&vector->interrupt);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
static void vfio_bar_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes\n", size);
        break;
    }

    if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m\n",
                     __func__, addr, data, size);
    }

    DPRINTF("%s(BAR%d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
            __func__, bar->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
}

static uint64_t vfio_bar_read(void *opaque,
                              hwaddr addr, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m\n",
                     __func__, addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    DPRINTF("%s(BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
            __func__, bar->nr, addr, size, data);

    /* Same as write above */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));

    return data;
}

static const MemoryRegionOps vfio_bar_ops = {
    .read = vfio_bar_read,
    .write = vfio_bar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * PCI config space
 */
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint32_t val = 0;

    /*
     * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX
     * capabilities, and the multifunction bit below.  We let VFIO handle
     * virtualizing everything else.  Performance is not a concern here.
     */
    if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
        (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
        (pdev->cap_present & QEMU_PCI_CAP_MSI &&
         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size))) {

        val = pci_default_read_config(pdev, addr, len);
    } else {
        if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) {
            error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m\n",
                         __func__, vdev->host.domain, vdev->host.bus,
                         vdev->host.slot, vdev->host.function, addr, len);
            return -errno;
        }
        val = le32_to_cpu(val);
    }

    /* Multifunction bit is virtualized in QEMU */
    if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) {
        uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION;

        if (len == 4) {
            mask <<= 16;
        }

        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
            val |= mask;
        } else {
            val &= ~mask;
        }
    }

    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, addr, len, val);

    return val;
}

static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint32_t val_le = cpu_to_le32(val);

    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m\n",
                     __func__, vdev->host.domain, vdev->host.bus,
                     vdev->host.slot, vdev->host.function, addr, val, len);
    }

    /* Write standard header bits to emulation */
    if (addr < PCI_CONFIG_HEADER_SIZE) {
        pci_default_write_config(pdev, addr, val, len);
        return;
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msi(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msi(vdev);
        }
    }

    if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msix(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msix(vdev);
        }
    }
}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
        return -errno;
    }

    return 0;
}

static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
    return -errno;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return !memory_region_is_ram(section->mr);
}

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    void *vaddr;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("vfio: SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region\n", __func__);
        return;
    }

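    /*
     * Editorial note: round the start up and the end down to page
     * boundaries; the IOMMU can only map whole pages.
     */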
    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    DPRINTF("vfio: region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
            iova, end - 1, vaddr);

    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
    if (ret) {
        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx", %p) = %d (%m)\n",
                     container, iova, end - iova, vaddr, ret);
    }
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("vfio: SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region\n", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    DPRINTF("vfio: region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
            iova, end - 1);

    ret = vfio_dma_unmap(container, iova, end - iova);
    if (ret) {
        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx") = %d (%m)\n",
                     container, iova, end - iova, ret);
    }
}

static MemoryListener vfio_memory_listener = {
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
};

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->iommu_data.listener);
}

/*
 * Interrupt setup
 */
static void vfio_disable_interrupts(VFIODevice *vdev)
{
    switch (vdev->interrupt) {
    case VFIO_INT_INTx:
        vfio_disable_intx(vdev);
        break;
    case VFIO_INT_MSI:
        vfio_disable_msi(vdev);
        break;
    case VFIO_INT_MSIX:
        vfio_disable_msix(vdev);
        break;
    }
}

static int vfio_setup_msi(VFIODevice *vdev, int pos)
{
    uint16_t ctrl;
    bool msi_64bit, msi_maskbit;
    int ret, entries;

    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        return -errno;
    }
    ctrl = le16_to_cpu(ctrl);

    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);

    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);

    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_report("vfio: msi_init failed\n");
        return ret;
    }
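    /*
     * Editorial note: 0xa bytes of base capability, plus 0xa more when
     * per-vector masking is supported, plus 4 for 64-bit addressing; the
     * sizes follow the PCI spec's MSI capability layouts.
     */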
    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);

    return 0;
}

/*
 * We don't have any control over how pci_add_capability() inserts
 * capabilities into the chain.  In order to set up MSI-X we need a
 * MemoryRegion for the BAR.  In order to set up the BAR and not
 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
 * need to first look for where the MSI-X table lives.  So we
 * unfortunately split MSI-X setup across two functions.
 */
static int vfio_early_setup_msix(VFIODevice *vdev)
{
    uint8_t pos;
    uint16_t ctrl;
    uint32_t table, pba;

    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
    if (!pos) {
        return 0;
    }

    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        return -errno;
    }

    if (pread(vdev->fd, &table, sizeof(table),
              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
        return -errno;
    }

    if (pread(vdev->fd, &pba, sizeof(pba),
              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
        return -errno;
    }

    ctrl = le16_to_cpu(ctrl);
    table = le32_to_cpu(table);
    pba = le32_to_cpu(pba);

    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
    vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

    DPRINTF("%04x:%02x:%02x.%x "
            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, pos, vdev->msix->table_bar,
            vdev->msix->table_offset, vdev->msix->entries);

    return 0;
}

static int vfio_setup_msix(VFIODevice *vdev, int pos)
{
    int ret;

    ret = msix_init(&vdev->pdev, vdev->msix->entries,
                    &vdev->bars[vdev->msix->table_bar].mem,
                    vdev->msix->table_bar, vdev->msix->table_offset,
                    &vdev->bars[vdev->msix->pba_bar].mem,
                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_report("vfio: msix_init failed\n");
        return ret;
    }

    return 0;
}

static void vfio_teardown_msi(VFIODevice *vdev)
{
    msi_uninit(&vdev->pdev);

    if (vdev->msix) {
        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
                    &vdev->bars[vdev->msix->pba_bar].mem);
    }
}

/*
 * Resource setup
 */
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        if (!bar->size) {
            continue;
        }

        memory_region_set_enabled(&bar->mmap_mem, enabled);
        if (vdev->msix && vdev->msix->table_bar == i) {
            memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
        }
    }
}

static void vfio_unmap_bar(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];

    if (!bar->size) {
        return;
    }

    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));

    if (vdev->msix && vdev->msix->table_bar == nr) {
        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
    }

    memory_region_destroy(&bar->mem);
}

static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
                         void **map, size_t size, off_t offset,
                         const char *name)
{
    int ret = 0;

    if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
        int prot = 0;

        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
            prot |= PROT_READ;
        }

        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
            prot |= PROT_WRITE;
        }

        *map = mmap(NULL, size, prot, MAP_SHARED,
                    bar->fd, bar->fd_offset + offset);
        if (*map == MAP_FAILED) {
            *map = NULL;
            ret = -errno;
            goto empty_region;
        }

        memory_region_init_ram_ptr(submem, name, size, *map);
    } else {
empty_region:
        /* Create a zero sized sub-region to make cleanup easy. */
        memory_region_init(submem, name, 0);
    }

    memory_region_add_subregion(mem, offset, submem);

    return ret;
}

static void vfio_map_bar(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    unsigned size = bar->size;
    char name[64];
    uint32_t pci_bar;
    uint8_t type;
    int ret;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!size) {
        return;
    }

    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, nr);

    /* Determine what type of BAR this is for registration */
    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(pci_bar)) {
        error_report("vfio: Failed to read BAR %d (%m)\n", nr);
        return;
    }

    pci_bar = le32_to_cpu(pci_bar);
    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);

    /* A "slow" read/write mapping underlies all BARs */
    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);

    /*
     * We can't mmap areas overlapping the MSIX vector table, so we
     * potentially insert a direct-mapped subregion before and after it.
     */
    if (vdev->msix && vdev->msix->table_bar == nr) {
        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
    }

    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
    if (vfio_mmap_bar(bar, &bar->mem,
                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
        error_report("%s unsupported. Performance may be slow\n", name);
    }

    if (vdev->msix && vdev->msix->table_bar == nr) {
        unsigned start;

        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

        size = start < bar->size ? bar->size - start : 0;
        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
        /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
                          &vdev->msix->mmap, size, start, name)) {
            error_report("%s unsupported. Performance may be slow\n", name);
        }
    }
}

static void vfio_map_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_map_bar(vdev, i);
    }
}

static void vfio_unmap_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_unmap_bar(vdev, i);
    }
}

/*
 * General setup
 */
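/*
 * Editorial note: the usable size of the standard capability at 'pos' is
 * bounded by its nearest successor in the capability list, or by the end
 * of standard config space (0xff) when it is the last one.
 */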
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint8_t tmp, next = 0xff;

    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
         tmp = pdev->config[tmp + 1]) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}

static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
{
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    int ret;

    cap_id = pdev->config[pos];
    next = pdev->config[pos + 1];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above, as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        ret = vfio_add_std_cap(vdev, next);
        if (ret) {
            return ret;
        }
    } else {
        pdev->config[PCI_CAPABILITY_LIST] = 0; /* Begin the rebuild */
    }

    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_setup_msi(vdev, pos);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_setup_msix(vdev, pos);
        break;
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size);
        break;
    }

    if (ret < 0) {
        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
                     "0x%x[0x%x]@0x%x: %d\n", vdev->host.domain,
                     vdev->host.bus, vdev->host.slot, vdev->host.function,
                     cap_id, size, pos, ret);
        return ret;
    }

    return 0;
}

static int vfio_add_capabilities(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;

    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
        !pdev->config[PCI_CAPABILITY_LIST]) {
        return 0; /* Nothing to add */
    }

    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
}

static int vfio_load_rom(VFIODevice *vdev)
{
    uint64_t size = vdev->rom_size;
    char name[32];
    off_t off = 0, voff = vdev->rom_offset;
    ssize_t bytes;
    void *ptr;

    /* If loading ROM from file, pci handles it */
    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
        return 0;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    memory_region_init_ram(&vdev->pdev.rom, name, size);
    ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
    memset(ptr, 0xff, size);

    while (size) {
1620
        bytes = pread(vdev->fd, ptr + off, size, voff + off);
1621
        if (bytes == 0) {
1622
            break; /* expect that we could get back less than the ROM BAR */
1623
        } else if (bytes > 0) {
1624
            off += bytes;
1625
            size -= bytes;
1626
        } else {
1627
            if (errno == EINTR || errno == EAGAIN) {
1628
                continue;
1629
            }
1630
            error_report("vfio: Error reading device ROM: %m\n");
1631
            memory_region_destroy(&vdev->pdev.rom);
1632
            return -errno;
1633
        }
1634
    }
1635

    
1636
    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
1637
    vdev->pdev.has_rom = true;
1638
    return 0;
1639
}
1640

    
1641
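/*
 * Attach a group to a container.  An existing container is shared when the
 * kernel accepts the group into it; otherwise a fresh /dev/vfio/vfio fd is
 * opened, version-checked, bound to the type1 IOMMU backend, and hooked to
 * the memory listener that maps guest RAM for DMA.
 */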
static int vfio_connect_container(VFIOGroup *group)
{
    VFIOContainer *container;
    int ret, fd;

    if (group->container) {
        return 0;
    }

    QLIST_FOREACH(container, &container_list, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_report("vfio: failed to open /dev/vfio/vfio: %m\n");
        return -errno;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d\n", VFIO_API_VERSION, ret);
        close(fd);
        return -EINVAL;
    }

    container = g_malloc0(sizeof(*container));
    container->fd = fd;

    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m\n");
            g_free(container);
            close(fd);
            return -errno;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m\n");
            g_free(container);
            close(fd);
            return -errno;
        }

        container->iommu_data.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.listener,
                                 &address_space_memory);
    } else {
        error_report("vfio: No available IOMMU models\n");
        g_free(container);
        close(fd);
        return -EINVAL;
    }

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&container_list, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;
}

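/*
 * Detach a group from its container.  The last group to leave releases the
 * IOMMU backend state and closes the container fd.
 */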
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container\n",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    if (QLIST_EMPTY(&container->group_list)) {
        if (container->iommu_data.release) {
            container->iommu_data.release(container);
        }
        QLIST_REMOVE(container, next);
        DPRINTF("vfio_disconnect_container: close container->fd\n");
        close(container->fd);
        g_free(container);
    }
}

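/*
 * Look up or create the VFIOGroup for an iommu group number.  New groups
 * are opened via /dev/vfio/<groupid>, checked for viability (every device
 * in the group bound to a vfio bus driver), and connected to a container
 * before joining the global group list.
 */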
static VFIOGroup *vfio_get_group(int groupid)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &group_list, next) {
        if (group->groupid == groupid) {
            return group;
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m\n", path);
        g_free(group);
        return NULL;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m\n");
        close(group->fd);
        g_free(group);
        return NULL;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.\n", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group)) {
        error_report("vfio: failed to setup container for group %d\n", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    QLIST_INSERT_HEAD(&group_list, group, next);

    return group;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    DPRINTF("vfio_put_group: close group->fd\n");
    close(group->fd);
    g_free(group);
}

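/*
 * Obtain a device fd from the group and sanity check it: the device must
 * be PCI with the expected region and interrupt indexes.  On success the
 * BAR, ROM, and config region offsets within the device fd are cached for
 * later mapping.
 */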
static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
    int ret, i;

    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (ret < 0) {
        error_report("vfio: error getting device %s from group %d: %m\n",
                     name, group->groupid);
        error_report("Verify all devices in group %d are bound to vfio-pci "
                     "or pci-stub and not already in use\n", group->groupid);
        return ret;
    }

    vdev->fd = ret;
    vdev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vdev, next);

    /* Sanity check device */
    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_report("vfio: error getting device info: %m\n");
        goto error;
    }

    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);

    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
        error_report("vfio: Um, this isn't a PCI device\n");
        ret = -EINVAL; /* don't return success from the error path below */
        goto error;
    }

    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    if (!vdev->reset_works) {
        error_report("Warning, device %s does not support reset\n", name);
    }

    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
        error_report("vfio: unexpected number of io regions %u\n",
                     dev_info.num_regions);
        ret = -EINVAL;
        goto error;
    }

    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
        error_report("vfio: unexpected number of irqs %u\n", dev_info.num_irqs);
        ret = -EINVAL;
        goto error;
    }

    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
        reg_info.index = i;

        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
        if (ret) {
            error_report("vfio: Error getting region %d info: %m\n", i);
            goto error;
        }

        DPRINTF("Device %s region %d:\n", name, i);
        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
                (unsigned long)reg_info.flags);

        vdev->bars[i].flags = reg_info.flags;
        vdev->bars[i].size = reg_info.size;
        vdev->bars[i].fd_offset = reg_info.offset;
        vdev->bars[i].fd = vdev->fd;
        vdev->bars[i].nr = i;
    }

    reg_info.index = VFIO_PCI_ROM_REGION_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
    if (ret) {
        error_report("vfio: Error getting ROM info: %m\n");
        goto error;
    }

    DPRINTF("Device %s ROM:\n", name);
    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
            (unsigned long)reg_info.flags);

    vdev->rom_size = reg_info.size;
    vdev->rom_offset = reg_info.offset;

    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
    if (ret) {
        error_report("vfio: Error getting config info: %m\n");
        goto error;
    }

    DPRINTF("Device %s config:\n", name);
    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
            (unsigned long)reg_info.flags);

    vdev->config_size = reg_info.size;
    vdev->config_offset = reg_info.offset;

error:
    if (ret) {
        QLIST_REMOVE(vdev, next);
        vdev->group = NULL;
        close(vdev->fd);
    }
    return ret;
}

static void vfio_put_device(VFIODevice *vdev)
{
    QLIST_REMOVE(vdev, next);
    vdev->group = NULL;
    DPRINTF("vfio_put_device: close vdev->fd\n");
    close(vdev->fd);
    if (vdev->msix) {
        g_free(vdev->msix);
        vdev->msix = NULL;
    }
}

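/*
 * Device init: resolve the host device's iommu group through sysfs, acquire
 * the group and device, snapshot config space, then set up the ROM, MSI-X,
 * BARs, and capabilities.  INTx is enabled last, and only if the device
 * reports an interrupt pin.
 */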
static int vfio_initfn(PCIDevice *pdev)
{
    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group;
    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
    ssize_t len;
    struct stat st;
    int groupid;
    int ret;

    /* Check that the host device exists */
    snprintf(path, sizeof(path),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    if (stat(path, &st) < 0) {
        error_report("vfio: error: no such host device: %s\n", path);
        return -errno;
    }

    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);

    len = readlink(path, iommu_group_path, sizeof(iommu_group_path) - 1);
    if (len <= 0) {
        error_report("vfio: error no iommu_group for device\n");
        return -errno;
    }

    iommu_group_path[len] = 0; /* readlink does not NUL terminate */
    group_name = basename(iommu_group_path);

    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_report("vfio: error reading %s: %m\n", path);
        return -errno;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);

    group = vfio_get_group(groupid);
    if (!group) {
        error_report("vfio: failed to get group %d\n", groupid);
        return -ENOENT;
    }

    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);

    QLIST_FOREACH(pvdev, &group->device_list, next) {
        if (pvdev->host.domain == vdev->host.domain &&
            pvdev->host.bus == vdev->host.bus &&
            pvdev->host.slot == vdev->host.slot &&
            pvdev->host.function == vdev->host.function) {

            error_report("vfio: error: device %s is already attached\n", path);
            vfio_put_group(group);
            return -EBUSY;
        }
    }

    ret = vfio_get_device(group, path, vdev);
    if (ret) {
        error_report("vfio: failed to get device %s\n", path);
        vfio_put_group(group);
        return ret;
    }

    /* Get a copy of config space */
    ret = pread(vdev->fd, vdev->pdev.config,
                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
                vdev->config_offset);
    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
        ret = ret < 0 ? -errno : -EFAULT;
        error_report("vfio: Failed to read device config space\n");
        goto out_put;
    }

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_load_rom(vdev);

    ret = vfio_early_setup_msix(vdev);
    if (ret) {
        goto out_put;
    }

    vfio_map_bars(vdev);

    ret = vfio_add_capabilities(vdev);
    if (ret) {
        goto out_teardown;
    }

    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
        vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
                                                  vfio_intx_mmap_enable, vdev);
        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
        ret = vfio_enable_intx(vdev);
        if (ret) {
            goto out_teardown;
        }
    }

    return 0;

out_teardown:
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
out_put:
    vfio_put_device(vdev);
    vfio_put_group(group);
    return ret;
}

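/*
 * Device teardown, the mirror of vfio_initfn: disable interrupts, free the
 * INTx mmap timer, tear down MSI state and BAR mappings, then release the
 * device and group references.
 */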
static void vfio_exitfn(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group = vdev->group;

    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_disable_interrupts(vdev);
    if (vdev->intx.mmap_timer) {
        qemu_free_timer(vdev->intx.mmap_timer);
    }
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
    vfio_put_device(vdev);
    vfio_put_group(group);
}

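/*
 * System reset: quiesce the device by disabling interrupts and clearing
 * I/O, memory, and bus master decoding, attempt a physical reset when the
 * kernel reports support, then re-enable INTx so the device comes back in
 * a usable state.
 */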
static void vfio_pci_reset(DeviceState *dev)
{
    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint16_t cmd;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vfio_disable_interrupts(vdev);

    /*
     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
     * Also put INTx Disable in a known state.
     */
    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
             PCI_COMMAND_INTX_DISABLE);
    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);

    if (vdev->reset_works) {
        if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
            error_report("vfio: Error unable to reset physical device "
                         "(%04x:%02x:%02x.%x): %m\n", vdev->host.domain,
                         vdev->host.bus, vdev->host.slot, vdev->host.function);
        }
    }

    vfio_enable_intx(vdev);
}

static Property vfio_pci_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
                       intx.mmap_timeout, 1100),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
     */
    DEFINE_PROP_END_OF_LIST(),
};

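/*
 * Typical usage (illustrative host address): bind the host device to
 * vfio-pci, then start QEMU with e.g. "-device vfio-pci,host=0000:06:0d.0".
 * The x-intx-mmap-timeout-ms property tunes how long BAR mmaps stay
 * disabled after an INTx interrupt before being re-enabled.
 */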
static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .unmigratable = 1,
};

static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);

    dc->reset = vfio_pci_reset;
    dc->props = vfio_pci_dev_properties;
    dc->vmsd = &vfio_pci_vmstate;
    dc->desc = "VFIO-based PCI device assignment";
    pdc->init = vfio_initfn;
    pdc->exit = vfio_exitfn;
    pdc->config_read = vfio_pci_read_config;
    pdc->config_write = vfio_pci_write_config;
}

static const TypeInfo vfio_pci_dev_info = {
    .name = "vfio-pci",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(VFIODevice),
    .class_init = vfio_pci_dev_class_init,
};

static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

type_init(register_vfio_pci_dev_type)