Statistics
| Branch: | Revision:

root / hw / dataplane / virtio-blk.c @ cd7fdfe5

History | View | Annotate | Download (15.9 kB)

1
/*
 * Dedicated thread for virtio-blk I/O processing
 *
 * Copyright 2012 IBM, Corp.
 * Copyright 2012 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
14

    
15
#include "trace.h"
16
#include "qemu/iov.h"
17
#include "event-poll.h"
18
#include "qemu/thread.h"
19
#include "vring.h"
20
#include "ioq.h"
21
#include "migration/migration.h"
22
#include "hw/virtio-blk.h"
23
#include "hw/dataplane/virtio-blk.h"
24

    
25
/* Sizing limits for the data plane.
 *
 * VRING_MAX sizes the on-stack iovec array in handle_notify() and REQ_MAX
 * sizes both the request pool in VirtIOBlockDataPlane and the ioqueue passed
 * to ioq_init().
 */
enum {
    SEG_MAX = 126,                  /* maximum number of I/O segments */
    VRING_MAX = SEG_MAX + 2,        /* maximum number of vring descriptors;
                                     * NOTE(review): the +2 presumably covers
                                     * the request and status header
                                     * descriptors - confirm */
    REQ_MAX = VRING_MAX,            /* maximum number of requests in the vring,
                                     * is VRING_MAX / 2 with traditional and
                                     * VRING_MAX with indirect descriptors */
};
32

    
33
typedef struct {
34
    struct iocb iocb;               /* Linux AIO control block */
35
    QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
36
    unsigned int head;              /* vring descriptor index */
37
    struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
38
    QEMUIOVector *read_qiov;        /* for read completion /w bounce buffer */
39
} VirtIOBlockRequest;
40

    
41
struct VirtIOBlockDataPlane {
42
    bool started;
43
    bool stopping;
44
    QEMUBH *start_bh;
45
    QemuThread thread;
46

    
47
    VirtIOBlkConf *blk;
48
    int fd;                         /* image file descriptor */
49

    
50
    VirtIODevice *vdev;
51
    Vring vring;                    /* virtqueue vring */
52
    EventNotifier *guest_notifier;  /* irq */
53

    
54
    EventPoll event_poll;           /* event poller */
55
    EventHandler io_handler;        /* Linux AIO completion handler */
56
    EventHandler notify_handler;    /* virtqueue notify handler */
57

    
58
    IOQueue ioqueue;                /* Linux AIO queue (should really be per
59
                                       dataplane thread) */
60
    VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
61
                                             queue */
62

    
63
    unsigned int num_reqs;
64

    
65
    Error *migration_blocker;
66
};
67

    
68
/* Raise an interrupt to signal guest, if necessary */
69
static void notify_guest(VirtIOBlockDataPlane *s)
70
{
71
    if (!vring_should_notify(s->vdev, &s->vring)) {
72
        return;
73
    }
74

    
75
    event_notifier_set(s->guest_notifier);
76
}
77

    
78
/* Completion callback for one read/write request.
 *
 * Passed to ioq_run_completion() by handle_io().
 *
 * @iocb:   control block embedded in the VirtIOBlockRequest being completed
 * @ret:    I/O result; a negative value is reported to the guest as
 *          VIRTIO_BLK_S_IOERR with a data length of 0, any other value as
 *          VIRTIO_BLK_S_OK with a data length of @ret
 * @opaque: the VirtIOBlockDataPlane that owns the request
 *
 * Releases everything do_rdwr_cmd()/process_request() allocated for the
 * request (read_qiov, bounce buffer, inhdr), writes the status byte into the
 * guest's inhdr and pushes the descriptor head back onto the vring.  The
 * guest is not notified here; handle_io() does that once after the whole
 * completion run.
 */
static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    struct virtio_blk_inhdr hdr;
    int len;

    if (likely(ret >= 0)) {
        hdr.status = VIRTIO_BLK_S_OK;
        len = ret;
    } else {
        hdr.status = VIRTIO_BLK_S_IOERR;
        len = 0;
    }

    trace_virtio_blk_data_plane_complete_request(s, req->head, ret);

    /* A read that went through a bounce buffer must be copied back into the
     * guest's own buffers before the bounce buffer is released below.
     */
    if (req->read_qiov) {
        assert(req->bounce_iov);
        qemu_iovec_from_buf(req->read_qiov, 0, req->bounce_iov->iov_base, len);
        qemu_iovec_destroy(req->read_qiov);
        g_slice_free(QEMUIOVector, req->read_qiov);
    }

    if (req->bounce_iov) {
        qemu_vfree(req->bounce_iov->iov_base);
        g_slice_free(struct iovec, req->bounce_iov);
    }

    /* Report the status to the guest, then drop our view of its inhdr */
    qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(req->inhdr);
    g_slice_free(QEMUIOVector, req->inhdr);

    /* According to the virtio specification len should be the number of bytes
     * written to, but for virtio-blk it seems to be the number of bytes
     * transferred plus the status bytes.
     */
    vring_push(&s->vring, req->head, len + sizeof(hdr));

    s->num_reqs--;
}
119

    
120
static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
121
                                   QEMUIOVector *inhdr, unsigned char status)
122
{
123
    struct virtio_blk_inhdr hdr = {
124
        .status = status,
125
    };
126

    
127
    qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
128
    qemu_iovec_destroy(inhdr);
129
    g_slice_free(QEMUIOVector, inhdr);
130

    
131
    vring_push(&s->vring, head, sizeof(hdr));
132
    notify_guest(s);
133
}
134

    
135
/* Get disk serial number */
136
static void do_get_id_cmd(VirtIOBlockDataPlane *s,
137
                          struct iovec *iov, unsigned int iov_cnt,
138
                          unsigned int head, QEMUIOVector *inhdr)
139
{
140
    char id[VIRTIO_BLK_ID_BYTES];
141

    
142
    /* Serial number not NUL-terminated when shorter than buffer */
143
    strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
144
    iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
145
    complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
146
}
147

    
148
static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
149
                       struct iovec *iov, unsigned int iov_cnt,
150
                       long long offset, unsigned int head,
151
                       QEMUIOVector *inhdr)
152
{
153
    struct iocb *iocb;
154
    QEMUIOVector qiov;
155
    struct iovec *bounce_iov = NULL;
156
    QEMUIOVector *read_qiov = NULL;
157

    
158
    qemu_iovec_init_external(&qiov, iov, iov_cnt);
159
    if (!bdrv_qiov_is_aligned(s->blk->conf.bs, &qiov)) {
160
        void *bounce_buffer = qemu_blockalign(s->blk->conf.bs, qiov.size);
161

    
162
        if (read) {
163
            /* Need to copy back from bounce buffer on completion */
164
            read_qiov = g_slice_new(QEMUIOVector);
165
            qemu_iovec_init(read_qiov, iov_cnt);
166
            qemu_iovec_concat_iov(read_qiov, iov, iov_cnt, 0, qiov.size);
167
        } else {
168
            qemu_iovec_to_buf(&qiov, 0, bounce_buffer, qiov.size);
169
        }
170

    
171
        /* Redirect I/O to aligned bounce buffer */
172
        bounce_iov = g_slice_new(struct iovec);
173
        bounce_iov->iov_base = bounce_buffer;
174
        bounce_iov->iov_len = qiov.size;
175
        iov = bounce_iov;
176
        iov_cnt = 1;
177
    }
178

    
179
    iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset);
180

    
181
    /* Fill in virtio block metadata needed for completion */
182
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
183
    req->head = head;
184
    req->inhdr = inhdr;
185
    req->bounce_iov = bounce_iov;
186
    req->read_qiov = read_qiov;
187
    return 0;
188
}
189

    
190
/* Parse one request popped from the vring and dispatch it by type.
 *
 * @ioq:     ioqueue embedded in the VirtIOBlockDataPlane
 * @iov:     iovecs of the request: @out_num elements followed by @in_num
 *           elements
 * @out_num: number of iovecs at the start of @iov holding the request header
 *           and any write data
 * @in_num:  number of iovecs after those, holding any read data and the
 *           status header
 * @head:    vring descriptor index of the request
 *
 * The virtio_blk_outhdr is copied from the front of the first group and the
 * virtio_blk_inhdr is located at the very end of the second group; both are
 * then trimmed off so only data buffers remain.
 *
 * Returns: 0 if the request was queued or completed early, -EFAULT if a
 * header is too short or the request type is unknown.  handle_notify() marks
 * the vring broken on a negative return.
 */
static int process_request(IOQueue *ioq, struct iovec iov[],
                           unsigned int out_num, unsigned int in_num,
                           unsigned int head)
{
    VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
    struct iovec *in_iov = &iov[out_num];
    struct virtio_blk_outhdr outhdr;
    QEMUIOVector *inhdr;
    size_t in_size;

    /* Copy in outhdr */
    if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
                            sizeof(outhdr)) != sizeof(outhdr))) {
        error_report("virtio-blk request outhdr too short");
        return -EFAULT;
    }
    iov_discard_front(&iov, &out_num, sizeof(outhdr));

    /* Grab inhdr for later */
    in_size = iov_size(in_iov, in_num);
    if (in_size < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio_blk request inhdr too short");
        return -EFAULT;
    }
    inhdr = g_slice_new(QEMUIOVector);
    qemu_iovec_init(inhdr, 1);
    qemu_iovec_concat_iov(inhdr, in_iov, in_num,
            in_size - sizeof(struct virtio_blk_inhdr),
            sizeof(struct virtio_blk_inhdr));
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    /* TODO Linux sets the barrier bit even when not advertised! */
    outhdr.type &= ~VIRTIO_BLK_T_BARRIER;

    switch (outhdr.type) {
    case VIRTIO_BLK_T_IN:
        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
        return 0;

    case VIRTIO_BLK_T_OUT:
        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
        return 0;

    case VIRTIO_BLK_T_SCSI_CMD:
        /* TODO support SCSI commands */
        complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
        return 0;

    case VIRTIO_BLK_T_FLUSH:
        /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
        if (qemu_fdatasync(s->fd) < 0) {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
        } else {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
        }
        return 0;

    case VIRTIO_BLK_T_GET_ID:
        do_get_id_cmd(s, in_iov, in_num, head, inhdr);
        return 0;

    default:
        error_report("virtio-blk unsupported request type %#x", outhdr.type);
        /* No completion path will run for this request, so release inhdr
         * here.
         */
        qemu_iovec_destroy(inhdr);
        g_slice_free(QEMUIOVector, inhdr);
        return -EFAULT;
    }
}
258

    
259
/* Virtqueue notify handler: drain the vring and submit the queued I/O.
 *
 * @handler: the notify_handler member of a VirtIOBlockDataPlane
 *
 * Registered with the event poller in virtio_blk_data_plane_start() and also
 * called directly from handle_io().  Terminates the process with exit(1) if
 * ioq_submit() fails.
 */
static void handle_notify(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           notify_handler);

    /* There is one array of iovecs into which all new requests are extracted
     * from the vring.  Requests are read from the vring and the translated
     * descriptors are written to the iovecs array.  The iovecs do not have to
     * persist across handle_notify() calls because the kernel copies the
     * iovecs on io_submit().
     *
     * Handling io_submit() EAGAIN may require storing the requests across
     * handle_notify() calls until the kernel has sufficient resources to
     * accept more I/O.  This is not implemented yet.
     */
    struct iovec iovec[VRING_MAX];
    struct iovec *end = &iovec[VRING_MAX];
    struct iovec *iov = iovec;

    /* When a request is read from the vring, the index of the first descriptor
     * (aka head) is returned so that the completed request can be pushed onto
     * the vring later.
     *
     * The number of hypervisor read-only iovecs is out_num.  The number of
     * hypervisor write-only iovecs is in_num.
     */
    int head;
    unsigned int out_num = 0, in_num = 0;
    unsigned int num_queued;

    for (;;) {
        /* Disable guest->host notifies to avoid unnecessary vmexits */
        vring_disable_notification(s->vdev, &s->vring);

        for (;;) {
            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
            if (head < 0) {
                break; /* no more requests */
            }

            trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
                                                        head);

            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
                vring_set_broken(&s->vring);
                break;
            }
            /* Next request's iovecs go after the ones just consumed */
            iov += out_num + in_num;
        }

        if (likely(head == -EAGAIN)) { /* vring emptied */
            /* Re-enable guest->host notifies and stop processing the vring.
             * But if the guest has snuck in more descriptors, keep processing.
             */
            if (vring_enable_notification(s->vdev, &s->vring)) {
                break;
            }
        } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
            /* Since there are no iovecs[] left, stop processing for now.  Do
             * not re-enable guest->host notifies since the I/O completion
             * handler knows to check for more vring descriptors anyway.
             */
            break;
        }
    }

    num_queued = ioq_num_queued(&s->ioqueue);
    if (num_queued > 0) {
        /* Account for the requests before submitting them */
        s->num_reqs += num_queued;

        int rc = ioq_submit(&s->ioqueue);
        if (unlikely(rc < 0)) {
            fprintf(stderr, "ioq_submit failed %d\n", rc);
            exit(1);
        }
    }
}
336

    
337
/* Linux AIO completion handler.
 *
 * @handler: the io_handler member of a VirtIOBlockDataPlane
 *
 * Runs complete_request() for finished requests via ioq_run_completion() and
 * notifies the guest once if that call returned a positive value.
 */
static void handle_io(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           io_handler);

    if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
        notify_guest(s);
    }

    /* If there were more requests than iovecs, the vring will not be empty yet
     * so check again.  There should now be enough resources to process more
     * requests.
     */
    if (unlikely(vring_more_avail(&s->vring))) {
        handle_notify(&s->notify_handler);
    }
}
354

    
355
static void *data_plane_thread(void *opaque)
356
{
357
    VirtIOBlockDataPlane *s = opaque;
358

    
359
    do {
360
        event_poll(&s->event_poll);
361
    } while (!s->stopping || s->num_reqs > 0);
362
    return NULL;
363
}
364

    
365
static void start_data_plane_bh(void *opaque)
366
{
367
    VirtIOBlockDataPlane *s = opaque;
368

    
369
    qemu_bh_delete(s->start_bh);
370
    s->start_bh = NULL;
371
    qemu_thread_create(&s->thread, data_plane_thread,
372
                       s, QEMU_THREAD_JOINABLE);
373
}
374

    
375
bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
376
                                  VirtIOBlockDataPlane **dataplane)
377
{
378
    VirtIOBlockDataPlane *s;
379
    int fd;
380

    
381
    *dataplane = NULL;
382

    
383
    if (!blk->data_plane) {
384
        return true;
385
    }
386

    
387
    if (blk->scsi) {
388
        error_report("device is incompatible with x-data-plane, use scsi=off");
389
        return false;
390
    }
391

    
392
    if (blk->config_wce) {
393
        error_report("device is incompatible with x-data-plane, "
394
                     "use config-wce=off");
395
        return false;
396
    }
397

    
398
    fd = raw_get_aio_fd(blk->conf.bs);
399
    if (fd < 0) {
400
        error_report("drive is incompatible with x-data-plane, "
401
                     "use format=raw,cache=none,aio=native");
402
        return false;
403
    }
404

    
405
    s = g_new0(VirtIOBlockDataPlane, 1);
406
    s->vdev = vdev;
407
    s->fd = fd;
408
    s->blk = blk;
409

    
410
    /* Prevent block operations that conflict with data plane thread */
411
    bdrv_set_in_use(blk->conf.bs, 1);
412

    
413
    error_setg(&s->migration_blocker,
414
            "x-data-plane does not support migration");
415
    migrate_add_blocker(s->migration_blocker);
416

    
417
    *dataplane = s;
418
    return true;
419
}
420

    
421
/* Stop and free a data plane instance.
 *
 * @s: instance returned by virtio_blk_data_plane_create(); NULL is accepted
 *     and ignored
 *
 * Stops the data plane, removes and frees the migration blocker, clears the
 * block device's in-use mark and frees @s.
 */
void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
{
    if (!s) {
        return;
    }

    virtio_blk_data_plane_stop(s);
    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
    bdrv_set_in_use(s->blk->conf.bs, 0);
    g_free(s);
}
433

    
434
/* Start servicing virtqueue 0 from the data plane thread.
 *
 * @s: data plane instance
 *
 * Does nothing if already started, and returns without starting if
 * vring_setup() fails.  Terminates the process with exit(1) if the guest or
 * host notifier cannot be set up.
 *
 * The thread itself is not created here: a bottom half is scheduled that
 * creates it later (see start_data_plane_bh()).
 */
void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
{
    VirtQueue *vq;
    int i;

    if (s->started) {
        return;
    }

    vq = virtio_get_queue(s->vdev, 0);
    if (!vring_setup(&s->vring, s->vdev, 0)) {
        return;
    }

    event_poll_init(&s->event_poll);

    /* Set up guest notifier (irq) */
    if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1,
                                              true) != 0) {
        fprintf(stderr, "virtio-blk failed to set guest notifier, "
                "ensure -enable-kvm is set\n");
        exit(1);
    }
    s->guest_notifier = virtio_queue_get_guest_notifier(vq);

    /* Set up virtqueue notify */
    if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
                                            0, true) != 0) {
        fprintf(stderr, "virtio-blk failed to set host notifier\n");
        exit(1);
    }
    event_poll_add(&s->event_poll, &s->notify_handler,
                   virtio_queue_get_host_notifier(vq),
                   handle_notify);

    /* Set up ioqueue: hand every request's iocb from the pool to the queue */
    ioq_init(&s->ioqueue, s->fd, REQ_MAX);
    for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
        ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
    }
    event_poll_add(&s->event_poll, &s->io_handler,
                   ioq_get_notifier(&s->ioqueue), handle_io);

    s->started = true;
    trace_virtio_blk_data_plane_start(s);

    /* Kick right away to begin processing requests already in vring */
    event_notifier_set(virtio_queue_get_host_notifier(vq));

    /* Spawn thread in BH so it inherits iothread cpusets */
    s->start_bh = qemu_bh_new(start_data_plane_bh, s);
    qemu_bh_schedule(s->start_bh);
}
487

    
488
/* Stop the data plane and release what virtio_blk_data_plane_start() set up.
 *
 * @s: data plane instance
 *
 * Does nothing if the data plane is not started or a stop is already in
 * progress.  If the thread exists it is woken with event_poll_notify() and
 * joined; data_plane_thread() only exits once no submitted requests remain
 * outstanding.
 */
void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
{
    if (!s->started || s->stopping) {
        return;
    }
    s->stopping = true;
    trace_virtio_blk_data_plane_stop(s);

    /* Stop thread or cancel pending thread creation BH */
    if (s->start_bh) {
        qemu_bh_delete(s->start_bh);
        s->start_bh = NULL;
    } else {
        event_poll_notify(&s->event_poll);
        qemu_thread_join(&s->thread);
    }

    ioq_cleanup(&s->ioqueue);

    s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);

    event_poll_cleanup(&s->event_poll);

    /* Clean up guest notifier (irq) */
    s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1, false);

    vring_teardown(&s->vring);
    s->started = false;
    s->stopping = false;
}