/*
 * Virtio Block Device
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <qemu-common.h>
#include "qemu-error.h"
#include "blockdev.h"
#include "virtio-blk.h"
#ifdef __linux__
# include <scsi/sg.h>
#endif
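
/*
 * Per-device state.  "rq" chains requests that were suspended after an
 * I/O error while the VM is stopped (see virtio_blk_handle_rw_error);
 * "bh" is the bottom half that resubmits them once the VM resumes.
 */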

typedef struct VirtIOBlock
{
    VirtIODevice vdev;
    BlockDriverState *bs;
    VirtQueue *vq;
    void *rq;
    QEMUBH *bh;
    BlockConf *conf;
    unsigned short sector_mask;
    char sn[BLOCK_SERIAL_STRLEN];
    DeviceState *qdev;
} VirtIOBlock;

static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
{
    return (VirtIOBlock *)vdev;
}

typedef struct VirtIOBlockReq
{
    VirtIOBlock *dev;
    VirtQueueElement elem;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr *out;
    struct virtio_scsi_inhdr *scsi;
    QEMUIOVector qiov;
    struct VirtIOBlockReq *next;
} VirtIOBlockReq;
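
/*
 * Complete a request: store the status byte in the guest-visible inhdr,
 * push the element onto the used ring and notify the guest.
 */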

static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
{
    VirtIOBlock *s = req->dev;

    req->in->status = status;
    virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
    virtio_notify(&s->vdev, s->vq);

    qemu_free(req);
}
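
/*
 * Apply the configured rerror/werror policy.  Returns 0 if the error is
 * ignored (the caller then completes the request as successful), 1 if the
 * request was either parked for retry after vm_stop() or completed with
 * VIRTIO_BLK_S_IOERR.
 */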

static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
    int is_read)
{
    BlockErrorAction action = bdrv_get_on_error(req->dev->bs, is_read);
    VirtIOBlock *s = req->dev;

    if (action == BLOCK_ERR_IGNORE) {
        bdrv_mon_event(s->bs, BDRV_ACTION_IGNORE, is_read);
        return 0;
    }

    if ((error == ENOSPC && action == BLOCK_ERR_STOP_ENOSPC)
            || action == BLOCK_ERR_STOP_ANY) {
        req->next = s->rq;
        s->rq = req;
        bdrv_mon_event(s->bs, BDRV_ACTION_STOP, is_read);
        vm_stop(0);
    } else {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
    }

    return 1;
}
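
/* Completion callback shared by reads and (multi)writes. */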

static void virtio_blk_rw_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;

    if (ret) {
        int is_read = !(req->out->type & VIRTIO_BLK_T_OUT);
        if (virtio_blk_handle_rw_error(req, -ret, is_read))
            return;
    }

    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
}

static void virtio_blk_flush_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;

    virtio_blk_req_complete(req, ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK);
}

static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
{
    VirtIOBlockReq *req = qemu_malloc(sizeof(*req));
    req->dev = s;
    req->qiov.size = 0;
    req->next = NULL;
    return req;
}
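
/*
 * Pop the next request from the virtqueue; returns NULL when the queue
 * is empty.
 */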

static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
{
    VirtIOBlockReq *req = virtio_blk_alloc_request(s);

    if (req != NULL) {
        if (!virtqueue_pop(s->vq, &req->elem)) {
            qemu_free(req);
            return NULL;
        }
    }

    return req;
}
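
/*
 * SG_IO passthrough.  The guest buffer layout assumed by the code below is:
 *   out_sg[0]            virtio_blk_outhdr
 *   out_sg[1]            SCSI command block (CDB)
 *   out_sg[2..]          write payload, if any
 *   in_sg[0..in_num-4]   read payload, if any
 *   in_sg[in_num-3]      sense buffer
 *   in_sg[in_num-2]      virtio_scsi_inhdr
 *   in_sg[in_num-1]      virtio_blk_inhdr
 */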

#ifdef __linux__
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    struct sg_io_hdr hdr;
    int ret;
    int status;
    int i;

    /*
     * We require at least one output segment each for the virtio_blk_outhdr
     * and the SCSI command block.
     *
     * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
     * and the sense buffer pointer in the input segments.
     */
    if (req->elem.out_num < 2 || req->elem.in_num < 3) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        return;
    }

    /*
     * No support for bidirectional commands yet.
     */
    if (req->elem.out_num > 2 && req->elem.in_num > 3) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
        return;
    }

    /*
     * The scsi inhdr is placed in the second-to-last input segment, just
     * before the regular inhdr.
     */
    req->scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;

    memset(&hdr, 0, sizeof(struct sg_io_hdr));
    hdr.interface_id = 'S';
    hdr.cmd_len = req->elem.out_sg[1].iov_len;
    hdr.cmdp = req->elem.out_sg[1].iov_base;
    hdr.dxfer_len = 0;

    if (req->elem.out_num > 2) {
        /*
         * If there are more than the minimally required 2 output segments
         * there is write payload starting from the third iovec.
         */
        hdr.dxfer_direction = SG_DXFER_TO_DEV;
        hdr.iovec_count = req->elem.out_num - 2;

        for (i = 0; i < hdr.iovec_count; i++)
            hdr.dxfer_len += req->elem.out_sg[i + 2].iov_len;

        hdr.dxferp = req->elem.out_sg + 2;

    } else if (req->elem.in_num > 3) {
        /*
         * If we have more than 3 input segments the guest wants to actually
         * read data.
         */
        hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        hdr.iovec_count = req->elem.in_num - 3;
        for (i = 0; i < hdr.iovec_count; i++)
            hdr.dxfer_len += req->elem.in_sg[i].iov_len;

        hdr.dxferp = req->elem.in_sg;
    } else {
        /*
         * Some SCSI commands don't actually transfer any data.
         */
        hdr.dxfer_direction = SG_DXFER_NONE;
    }

    hdr.sbp = req->elem.in_sg[req->elem.in_num - 3].iov_base;
    hdr.mx_sb_len = req->elem.in_sg[req->elem.in_num - 3].iov_len;

    ret = bdrv_ioctl(req->dev->bs, SG_IO, &hdr);
    if (ret) {
        status = VIRTIO_BLK_S_UNSUPP;
        hdr.status = ret;
        hdr.resid = hdr.dxfer_len;
    } else if (hdr.status) {
        status = VIRTIO_BLK_S_IOERR;
    } else {
        status = VIRTIO_BLK_S_OK;
    }

    req->scsi->errors = hdr.status;
    req->scsi->residual = hdr.resid;
    req->scsi->sense_len = hdr.sb_len_wr;
    req->scsi->data_len = hdr.dxfer_len;

    virtio_blk_req_complete(req, status);
}
#else
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
}
#endif /* __linux__ */
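
/*
 * Writes are batched into a MultiReqBuffer and submitted in a single
 * bdrv_aio_multiwrite() call, which gives the block layer a chance to
 * merge adjacent requests; 32 matches the size of the blkreq array.
 */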

typedef struct MultiReqBuffer {
    BlockRequest        blkreq[32];
    unsigned int        num_writes;
} MultiReqBuffer;
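
/*
 * Flush the batch.  If bdrv_aio_multiwrite() fails it flags the affected
 * requests via their error fields; complete those with -EIO here, while
 * the rest complete through their normal callbacks.
 */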

static void virtio_submit_multiwrite(BlockDriverState *bs, MultiReqBuffer *mrb)
{
    int i, ret;

    if (!mrb->num_writes) {
        return;
    }

    ret = bdrv_aio_multiwrite(bs, mrb->blkreq, mrb->num_writes);
    if (ret != 0) {
        for (i = 0; i < mrb->num_writes; i++) {
            if (mrb->blkreq[i].error) {
                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
            }
        }
    }

    mrb->num_writes = 0;
}
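
/*
 * VIRTIO_BLK_T_FLUSH: drain the pending write batch first so the flush
 * covers everything submitted before it, then issue an asynchronous flush.
 */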

static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    BlockDriverAIOCB *acb;

    /*
     * Make sure all outstanding writes are posted to the backing device.
     */
    virtio_submit_multiwrite(req->dev->bs, mrb);

    acb = bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);
    if (!acb) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
    }
}
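
/*
 * Queue a write into the batch.  Requests not aligned to the logical
 * block size (sector & sector_mask) are failed with EIO.
 */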

static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    BlockRequest *blkreq;

    if (req->out->sector & req->dev->sector_mask) {
        virtio_blk_rw_complete(req, -EIO);
        return;
    }

    if (mrb->num_writes == 32) {
        virtio_submit_multiwrite(req->dev->bs, mrb);
    }

    blkreq = &mrb->blkreq[mrb->num_writes];
    blkreq->sector = req->out->sector;
    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
    blkreq->qiov = &req->qiov;
    blkreq->cb = virtio_blk_rw_complete;
    blkreq->opaque = req;
    blkreq->error = 0;

    mrb->num_writes++;
}
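
/* Reads bypass the batch and are submitted directly with bdrv_aio_readv(). */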

static void virtio_blk_handle_read(VirtIOBlockReq *req)
{
    BlockDriverAIOCB *acb;

    if (req->out->sector & req->dev->sector_mask) {
        virtio_blk_rw_complete(req, -EIO);
        return;
    }

    acb = bdrv_aio_readv(req->dev->bs, req->out->sector, &req->qiov,
                         req->qiov.size / BDRV_SECTOR_SIZE,
                         virtio_blk_rw_complete, req);
    if (!acb) {
        virtio_blk_rw_complete(req, -EIO);
    }
}
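
/*
 * Parse one virtqueue element: out_sg[0] must hold the virtio_blk_outhdr
 * and the last input segment the virtio_blk_inhdr; then dispatch on the
 * type bits.  Note the bits are tested with '&', so a flush bit set
 * alongside other bits is still handled as a flush.
 */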

static void virtio_blk_handle_request(VirtIOBlockReq *req,
    MultiReqBuffer *mrb)
{
    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
        fprintf(stderr, "virtio-blk missing headers\n");
        exit(1);
    }

    if (req->elem.out_sg[0].iov_len < sizeof(*req->out) ||
        req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) {
        fprintf(stderr, "virtio-blk header not in correct element\n");
        exit(1);
    }

    req->out = (void *)req->elem.out_sg[0].iov_base;
    req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base;

    if (req->out->type & VIRTIO_BLK_T_FLUSH) {
        virtio_blk_handle_flush(req, mrb);
    } else if (req->out->type & VIRTIO_BLK_T_SCSI_CMD) {
        virtio_blk_handle_scsi(req);
    } else if (req->out->type & VIRTIO_BLK_T_GET_ID) {
        VirtIOBlock *s = req->dev;

        memcpy(req->elem.in_sg[0].iov_base, s->sn,
               MIN(req->elem.in_sg[0].iov_len, sizeof(s->sn)));
        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
    } else if (req->out->type & VIRTIO_BLK_T_OUT) {
        qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
                                 req->elem.out_num - 1);
        virtio_blk_handle_write(req, mrb);
    } else {
        qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0],
                                 req->elem.in_num - 1);
        virtio_blk_handle_read(req);
    }
}
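
/* Virtqueue kick handler: drain the queue, then submit the write batch. */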

static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOBlock *s = to_virtio_blk(vdev);
    VirtIOBlockReq *req;
    MultiReqBuffer mrb = {
        .num_writes = 0,
    };

    while ((req = virtio_blk_get_request(s))) {
        virtio_blk_handle_request(req, &mrb);
    }

    virtio_submit_multiwrite(s->bs, &mrb);

    /*
     * FIXME: Want to check for completions before returning to guest mode,
     * so cached reads and writes are reported as quickly as possible. But
     * that should be done in the generic block layer.
     */
}
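
/*
 * Bottom half run when the VM resumes: resubmit the requests that were
 * parked on s->rq by virtio_blk_handle_rw_error().
 */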

static void virtio_blk_dma_restart_bh(void *opaque)
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;
    MultiReqBuffer mrb = {
        .num_writes = 0,
    };

    qemu_bh_delete(s->bh);
    s->bh = NULL;

    s->rq = NULL;

    while (req) {
        virtio_blk_handle_request(req, &mrb);
        req = req->next;
    }

    virtio_submit_multiwrite(s->bs, &mrb);
}

static void virtio_blk_dma_restart_cb(void *opaque, int running, int reason)
{
    VirtIOBlock *s = opaque;

    if (!running)
        return;

    if (!s->bh) {
        s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s);
        qemu_bh_schedule(s->bh);
    }
}

static void virtio_blk_reset(VirtIODevice *vdev)
{
    /*
     * This should cancel pending requests, but that cannot be done nicely
     * until there are per-device request lists.
     */
    qemu_aio_flush();
}

/* Coalesce internal state and copy it to PCI I/O region 0.  seg_max is the
 * queue size (128) minus two descriptors left free for each request's
 * outhdr and inhdr.
 */
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
    VirtIOBlock *s = to_virtio_blk(vdev);
    struct virtio_blk_config blkcfg;
    uint64_t capacity;
    int cylinders, heads, secs;

    bdrv_get_geometry(s->bs, &capacity);
    bdrv_get_geometry_hint(s->bs, &cylinders, &heads, &secs);
    memset(&blkcfg, 0, sizeof(blkcfg));
    stq_raw(&blkcfg.capacity, capacity);
    stl_raw(&blkcfg.seg_max, 128 - 2);
    stw_raw(&blkcfg.cylinders, cylinders);
    blkcfg.heads = heads;
    blkcfg.sectors = secs & ~s->sector_mask;
    blkcfg.blk_size = s->conf->logical_block_size;
    blkcfg.size_max = 0;
    blkcfg.physical_block_exp = get_physical_block_exp(s->conf);
    blkcfg.alignment_offset = 0;
    blkcfg.min_io_size = s->conf->min_io_size / blkcfg.blk_size;
    blkcfg.opt_io_size = s->conf->opt_io_size / blkcfg.blk_size;
    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
}

static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
{
    VirtIOBlock *s = to_virtio_blk(vdev);

    features |= (1 << VIRTIO_BLK_F_SEG_MAX);
    features |= (1 << VIRTIO_BLK_F_GEOMETRY);
    features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
    features |= (1 << VIRTIO_BLK_F_BLK_SIZE);

    if (bdrv_enable_write_cache(s->bs))
        features |= (1 << VIRTIO_BLK_F_WCACHE);

    if (bdrv_is_read_only(s->bs))
        features |= 1 << VIRTIO_BLK_F_RO;

    return features;
}
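
/*
 * Migration: besides the generic virtio state, save the chain of requests
 * that were suspended after an I/O error so they can be retried on the
 * destination.
 */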

static void virtio_blk_save(QEMUFile *f, void *opaque)
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;

    virtio_save(&s->vdev, f);

    while (req) {
        qemu_put_sbyte(f, 1);
        qemu_put_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
        req = req->next;
    }
    qemu_put_sbyte(f, 0);
}

static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
{
    VirtIOBlock *s = opaque;

    if (version_id != 2)
        return -EINVAL;

    virtio_load(&s->vdev, f);
    while (qemu_get_sbyte(f)) {
        VirtIOBlockReq *req = virtio_blk_alloc_request(s);
        qemu_get_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
        req->next = s->rq;
        s->rq = req;

        virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr,
            req->elem.in_num, 1);
        virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr,
            req->elem.out_num, 0);
    }

    return 0;
}
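
/*
 * Create the device: validate the backing drive, register the
 * config/feature callbacks, add a 128-entry virtqueue, and hook up live
 * migration plus the vm-state change handler that restarts parked requests.
 */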

VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf)
{
    VirtIOBlock *s;
    int cylinders, heads, secs;
    static int virtio_blk_id;
    DriveInfo *dinfo;

    if (!conf->bs) {
        error_report("virtio-blk-pci: drive property not set");
        return NULL;
    }
    if (!bdrv_is_inserted(conf->bs)) {
        error_report("Device needs media, but drive is empty");
        return NULL;
    }

    s = (VirtIOBlock *)virtio_common_init("virtio-blk", VIRTIO_ID_BLOCK,
                                          sizeof(struct virtio_blk_config),
                                          sizeof(VirtIOBlock));

    s->vdev.get_config = virtio_blk_update_config;
    s->vdev.get_features = virtio_blk_get_features;
    s->vdev.reset = virtio_blk_reset;
    s->bs = conf->bs;
    s->conf = conf;
    s->rq = NULL;
    s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1;
    bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);

    /* NB: per the existing s/n string convention the string is terminated
     * by '\0' only when it is shorter than sizeof(s->sn).
     */
    dinfo = drive_get_by_blockdev(s->bs);
    strncpy(s->sn, dinfo->serial, sizeof (s->sn));

    s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);

    qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
    s->qdev = dev;
    register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                    virtio_blk_save, virtio_blk_load, s);
    bdrv_set_removable(s->bs, 0);

    return &s->vdev;
}

void virtio_blk_exit(VirtIODevice *vdev)
{
    VirtIOBlock *s = to_virtio_blk(vdev);
    unregister_savevm(s->qdev, "virtio-blk", s);
}