Statistics
| Branch: | Revision:

root / hw / xen_disk.c @ 0200db65

History | View | Annotate | Download (25.5 kB)

1
/*
2
 *  xen paravirt block device backend
3
 *
4
 *  (c) Gerd Hoffmann <kraxel@redhat.com>
5
 *
6
 *  This program is free software; you can redistribute it and/or modify
7
 *  it under the terms of the GNU General Public License as published by
8
 *  the Free Software Foundation; under version 2 of the License.
9
 *
10
 *  This program is distributed in the hope that it will be useful,
11
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 *  GNU General Public License for more details.
14
 *
15
 *  You should have received a copy of the GNU General Public License along
16
 *  with this program; if not, see <http://www.gnu.org/licenses/>.
17
 */
18

    
19
#include <stdio.h>
20
#include <stdlib.h>
21
#include <stdarg.h>
22
#include <string.h>
23
#include <unistd.h>
24
#include <signal.h>
25
#include <inttypes.h>
26
#include <time.h>
27
#include <fcntl.h>
28
#include <errno.h>
29
#include <sys/ioctl.h>
30
#include <sys/types.h>
31
#include <sys/stat.h>
32
#include <sys/mman.h>
33
#include <sys/uio.h>
34

    
35
#include <xs.h>
36
#include <xenctrl.h>
37
#include <xen/io/xenbus.h>
38

    
39
#include "hw.h"
40
#include "block_int.h"
41
#include "qemu-char.h"
42
#include "xen_blkif.h"
43
#include "xen_backend.h"
44
#include "blockdev.h"
45

    
46
/* ------------------------------------------------------------- */
47

    
48
/* When set, every write is followed by a flush (see ioreq_parse). */
static int syncwrite;
/* When set, all segments of a request are grant-mapped in one call. */
static int batch_maps;

/* Upper bound on the number of ioreq structs allocated per device. */
static int max_requests = 32;
/* When set, requests are submitted through the aio read/write path. */
static int use_aio = 1;

/* ------------------------------------------------------------- */

/* Sector size in bytes used for all offset <-> sector conversions. */
#define BLOCK_SIZE  512
#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
58

    
59
struct ioreq {
60
    blkif_request_t     req;
61
    int16_t             status;
62

    
63
    /* parsed request */
64
    off_t               start;
65
    QEMUIOVector        v;
66
    int                 presync;
67
    int                 postsync;
68

    
69
    /* grant mapping */
70
    uint32_t            domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
71
    uint32_t            refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
72
    int                 prot;
73
    void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
74
    void                *pages;
75

    
76
    /* aio status */
77
    int                 aio_inflight;
78
    int                 aio_errors;
79

    
80
    struct XenBlkDev    *blkdev;
81
    QLIST_ENTRY(ioreq)   list;
82
    BlockAcctCookie     acct;
83
};
84

    
85
struct XenBlkDev {
86
    struct XenDevice    xendev;  /* must be first */
87
    char                *params;
88
    char                *mode;
89
    char                *type;
90
    char                *dev;
91
    char                *devtype;
92
    const char          *fileproto;
93
    const char          *filename;
94
    int                 ring_ref;
95
    void                *sring;
96
    int64_t             file_blk;
97
    int64_t             file_size;
98
    int                 protocol;
99
    blkif_back_rings_t  rings;
100
    int                 more_work;
101
    int                 cnt_map;
102

    
103
    /* request lists */
104
    QLIST_HEAD(inflight_head, ioreq) inflight;
105
    QLIST_HEAD(finished_head, ioreq) finished;
106
    QLIST_HEAD(freelist_head, ioreq) freelist;
107
    int                 requests_total;
108
    int                 requests_inflight;
109
    int                 requests_finished;
110

    
111
    /* qemu block driver */
112
    DriveInfo           *dinfo;
113
    BlockDriverState    *bs;
114
    QEMUBH              *bh;
115
};
116

    
117
/* ------------------------------------------------------------- */
118

    
119
static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
120
{
121
    struct ioreq *ioreq = NULL;
122

    
123
    if (QLIST_EMPTY(&blkdev->freelist)) {
124
        if (blkdev->requests_total >= max_requests) {
125
            goto out;
126
        }
127
        /* allocate new struct */
128
        ioreq = g_malloc0(sizeof(*ioreq));
129
        ioreq->blkdev = blkdev;
130
        blkdev->requests_total++;
131
        qemu_iovec_init(&ioreq->v, BLKIF_MAX_SEGMENTS_PER_REQUEST);
132
    } else {
133
        /* get one from freelist */
134
        ioreq = QLIST_FIRST(&blkdev->freelist);
135
        QLIST_REMOVE(ioreq, list);
136
        qemu_iovec_reset(&ioreq->v);
137
    }
138
    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
139
    blkdev->requests_inflight++;
140

    
141
out:
142
    return ioreq;
143
}
144

    
145
static void ioreq_finish(struct ioreq *ioreq)
146
{
147
    struct XenBlkDev *blkdev = ioreq->blkdev;
148

    
149
    QLIST_REMOVE(ioreq, list);
150
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
151
    blkdev->requests_inflight--;
152
    blkdev->requests_finished++;
153
}
154

    
155
static void ioreq_release(struct ioreq *ioreq)
156
{
157
    struct XenBlkDev *blkdev = ioreq->blkdev;
158

    
159
    QLIST_REMOVE(ioreq, list);
160
    memset(ioreq, 0, sizeof(*ioreq));
161
    ioreq->blkdev = blkdev;
162
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
163
    blkdev->requests_finished--;
164
}
165

    
166
/*
167
 * translate request into iovec + start offset
168
 * do sanity checks along the way
169
 */
170
static int ioreq_parse(struct ioreq *ioreq)
171
{
172
    struct XenBlkDev *blkdev = ioreq->blkdev;
173
    uintptr_t mem;
174
    size_t len;
175
    int i;
176

    
177
    xen_be_printf(&blkdev->xendev, 3,
178
                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
179
                  ioreq->req.operation, ioreq->req.nr_segments,
180
                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
181
    switch (ioreq->req.operation) {
182
    case BLKIF_OP_READ:
183
        ioreq->prot = PROT_WRITE; /* to memory */
184
        break;
185
    case BLKIF_OP_WRITE_BARRIER:
186
        if (!ioreq->req.nr_segments) {
187
            ioreq->presync = 1;
188
            return 0;
189
        }
190
        if (!syncwrite) {
191
            ioreq->presync = ioreq->postsync = 1;
192
        }
193
        /* fall through */
194
    case BLKIF_OP_WRITE:
195
        ioreq->prot = PROT_READ; /* from memory */
196
        if (syncwrite) {
197
            ioreq->postsync = 1;
198
        }
199
        break;
200
    default:
201
        xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
202
                      ioreq->req.operation);
203
        goto err;
204
    };
205

    
206
    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
207
        xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
208
        goto err;
209
    }
210

    
211
    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
212
    for (i = 0; i < ioreq->req.nr_segments; i++) {
213
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
214
            xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
215
            goto err;
216
        }
217
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
218
            xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
219
            goto err;
220
        }
221
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
222
            xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
223
            goto err;
224
        }
225

    
226
        ioreq->domids[i] = blkdev->xendev.dom;
227
        ioreq->refs[i]   = ioreq->req.seg[i].gref;
228

    
229
        mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
230
        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
231
        qemu_iovec_add(&ioreq->v, (void*)mem, len);
232
    }
233
    if (ioreq->start + ioreq->v.size > blkdev->file_size) {
234
        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
235
        goto err;
236
    }
237
    return 0;
238

    
239
err:
240
    ioreq->status = BLKIF_RSP_ERROR;
241
    return -1;
242
}
243

    
244
static void ioreq_unmap(struct ioreq *ioreq)
245
{
246
    XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
247
    int i;
248

    
249
    if (ioreq->v.niov == 0) {
250
        return;
251
    }
252
    if (batch_maps) {
253
        if (!ioreq->pages) {
254
            return;
255
        }
256
        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
257
            xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
258
                          strerror(errno));
259
        }
260
        ioreq->blkdev->cnt_map -= ioreq->v.niov;
261
        ioreq->pages = NULL;
262
    } else {
263
        for (i = 0; i < ioreq->v.niov; i++) {
264
            if (!ioreq->page[i]) {
265
                continue;
266
            }
267
            if (xc_gnttab_munmap(gnt, ioreq->page[i], 1) != 0) {
268
                xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
269
                              strerror(errno));
270
            }
271
            ioreq->blkdev->cnt_map--;
272
            ioreq->page[i] = NULL;
273
        }
274
    }
275
}
276

    
277
static int ioreq_map(struct ioreq *ioreq)
278
{
279
    XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
280
    int i;
281

    
282
    if (ioreq->v.niov == 0) {
283
        return 0;
284
    }
285
    if (batch_maps) {
286
        ioreq->pages = xc_gnttab_map_grant_refs
287
            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
288
        if (ioreq->pages == NULL) {
289
            xen_be_printf(&ioreq->blkdev->xendev, 0,
290
                          "can't map %d grant refs (%s, %d maps)\n",
291
                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
292
            return -1;
293
        }
294
        for (i = 0; i < ioreq->v.niov; i++) {
295
            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
296
                (uintptr_t)ioreq->v.iov[i].iov_base;
297
        }
298
        ioreq->blkdev->cnt_map += ioreq->v.niov;
299
    } else  {
300
        for (i = 0; i < ioreq->v.niov; i++) {
301
            ioreq->page[i] = xc_gnttab_map_grant_ref
302
                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
303
            if (ioreq->page[i] == NULL) {
304
                xen_be_printf(&ioreq->blkdev->xendev, 0,
305
                              "can't map grant ref %d (%s, %d maps)\n",
306
                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
307
                ioreq_unmap(ioreq);
308
                return -1;
309
            }
310
            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
311
            ioreq->blkdev->cnt_map++;
312
        }
313
    }
314
    return 0;
315
}
316

    
317
static int ioreq_runio_qemu_sync(struct ioreq *ioreq)
318
{
319
    struct XenBlkDev *blkdev = ioreq->blkdev;
320
    int i, rc;
321
    off_t pos;
322

    
323
    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
324
        goto err_no_map;
325
    }
326
    if (ioreq->presync) {
327
        bdrv_flush(blkdev->bs);
328
    }
329

    
330
    switch (ioreq->req.operation) {
331
    case BLKIF_OP_READ:
332
        pos = ioreq->start;
333
        for (i = 0; i < ioreq->v.niov; i++) {
334
            rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE,
335
                           ioreq->v.iov[i].iov_base,
336
                           ioreq->v.iov[i].iov_len / BLOCK_SIZE);
337
            if (rc != 0) {
338
                xen_be_printf(&blkdev->xendev, 0, "rd I/O error (%p, len %zd)\n",
339
                              ioreq->v.iov[i].iov_base,
340
                              ioreq->v.iov[i].iov_len);
341
                goto err;
342
            }
343
            pos += ioreq->v.iov[i].iov_len;
344
        }
345
        break;
346
    case BLKIF_OP_WRITE:
347
    case BLKIF_OP_WRITE_BARRIER:
348
        if (!ioreq->req.nr_segments) {
349
            break;
350
        }
351
        pos = ioreq->start;
352
        for (i = 0; i < ioreq->v.niov; i++) {
353
            rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE,
354
                            ioreq->v.iov[i].iov_base,
355
                            ioreq->v.iov[i].iov_len / BLOCK_SIZE);
356
            if (rc != 0) {
357
                xen_be_printf(&blkdev->xendev, 0, "wr I/O error (%p, len %zd)\n",
358
                              ioreq->v.iov[i].iov_base,
359
                              ioreq->v.iov[i].iov_len);
360
                goto err;
361
            }
362
            pos += ioreq->v.iov[i].iov_len;
363
        }
364
        break;
365
    default:
366
        /* unknown operation (shouldn't happen -- parse catches this) */
367
        goto err;
368
    }
369

    
370
    if (ioreq->postsync) {
371
        bdrv_flush(blkdev->bs);
372
    }
373
    ioreq->status = BLKIF_RSP_OKAY;
374

    
375
    ioreq_unmap(ioreq);
376
    ioreq_finish(ioreq);
377
    return 0;
378

    
379
err:
380
    ioreq_unmap(ioreq);
381
err_no_map:
382
    ioreq_finish(ioreq);
383
    ioreq->status = BLKIF_RSP_ERROR;
384
    return -1;
385
}
386

    
387
static void qemu_aio_complete(void *opaque, int ret)
388
{
389
    struct ioreq *ioreq = opaque;
390

    
391
    if (ret != 0) {
392
        xen_be_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
393
                      ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
394
        ioreq->aio_errors++;
395
    }
396

    
397
    ioreq->aio_inflight--;
398
    if (ioreq->aio_inflight > 0) {
399
        return;
400
    }
401

    
402
    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
403
    ioreq_unmap(ioreq);
404
    ioreq_finish(ioreq);
405
    bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct);
406
    qemu_bh_schedule(ioreq->blkdev->bh);
407
}
408

    
409
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
410
{
411
    struct XenBlkDev *blkdev = ioreq->blkdev;
412

    
413
    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
414
        goto err_no_map;
415
    }
416

    
417
    ioreq->aio_inflight++;
418
    if (ioreq->presync) {
419
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
420
    }
421

    
422
    switch (ioreq->req.operation) {
423
    case BLKIF_OP_READ:
424
        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
425
        ioreq->aio_inflight++;
426
        bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
427
                       &ioreq->v, ioreq->v.size / BLOCK_SIZE,
428
                       qemu_aio_complete, ioreq);
429
        break;
430
    case BLKIF_OP_WRITE:
431
    case BLKIF_OP_WRITE_BARRIER:
432
        if (!ioreq->req.nr_segments) {
433
            break;
434
        }
435

    
436
        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
437
        ioreq->aio_inflight++;
438
        bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
439
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
440
                        qemu_aio_complete, ioreq);
441
        break;
442
    default:
443
        /* unknown operation (shouldn't happen -- parse catches this) */
444
        goto err;
445
    }
446

    
447
    if (ioreq->postsync) {
448
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
449
    }
450
    qemu_aio_complete(ioreq, 0);
451

    
452
    return 0;
453

    
454
err:
455
    ioreq_unmap(ioreq);
456
err_no_map:
457
    ioreq_finish(ioreq);
458
    ioreq->status = BLKIF_RSP_ERROR;
459
    return -1;
460
}
461

    
462
static int blk_send_response_one(struct ioreq *ioreq)
463
{
464
    struct XenBlkDev  *blkdev = ioreq->blkdev;
465
    int               send_notify   = 0;
466
    int               have_requests = 0;
467
    blkif_response_t  resp;
468
    void              *dst;
469

    
470
    resp.id        = ioreq->req.id;
471
    resp.operation = ioreq->req.operation;
472
    resp.status    = ioreq->status;
473

    
474
    /* Place on the response ring for the relevant domain. */
475
    switch (blkdev->protocol) {
476
    case BLKIF_PROTOCOL_NATIVE:
477
        dst = RING_GET_RESPONSE(&blkdev->rings.native, blkdev->rings.native.rsp_prod_pvt);
478
        break;
479
    case BLKIF_PROTOCOL_X86_32:
480
        dst = RING_GET_RESPONSE(&blkdev->rings.x86_32_part,
481
                                blkdev->rings.x86_32_part.rsp_prod_pvt);
482
        break;
483
    case BLKIF_PROTOCOL_X86_64:
484
        dst = RING_GET_RESPONSE(&blkdev->rings.x86_64_part,
485
                                blkdev->rings.x86_64_part.rsp_prod_pvt);
486
        break;
487
    default:
488
        dst = NULL;
489
    }
490
    memcpy(dst, &resp, sizeof(resp));
491
    blkdev->rings.common.rsp_prod_pvt++;
492

    
493
    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
494
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
495
        /*
496
         * Tail check for pending requests. Allows frontend to avoid
497
         * notifications if requests are already in flight (lower
498
         * overheads and promotes batching).
499
         */
500
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
501
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
502
        have_requests = 1;
503
    }
504

    
505
    if (have_requests) {
506
        blkdev->more_work++;
507
    }
508
    return send_notify;
509
}
510

    
511
/* walk finished list, send outstanding responses, free requests */
512
static void blk_send_response_all(struct XenBlkDev *blkdev)
513
{
514
    struct ioreq *ioreq;
515
    int send_notify = 0;
516

    
517
    while (!QLIST_EMPTY(&blkdev->finished)) {
518
        ioreq = QLIST_FIRST(&blkdev->finished);
519
        send_notify += blk_send_response_one(ioreq);
520
        ioreq_release(ioreq);
521
    }
522
    if (send_notify) {
523
        xen_be_send_notify(&blkdev->xendev);
524
    }
525
}
526

    
527
/*
 * Copy the request at ring index rc into ioreq->req, converting from
 * the x86_32 / x86_64 layouts where the protocol requires it.  For any
 * other protocol value nothing is copied.  Always returns 0.
 */
static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
{
    blkif_request_t *dst = &ioreq->req;

    if (blkdev->protocol == BLKIF_PROTOCOL_NATIVE) {
        memcpy(dst, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
    } else if (blkdev->protocol == BLKIF_PROTOCOL_X86_32) {
        blkif_get_x86_32_req(dst,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
    } else if (blkdev->protocol == BLKIF_PROTOCOL_X86_64) {
        blkif_get_x86_64_req(dst,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
    }
    return 0;
}
545

    
546
static void blk_handle_requests(struct XenBlkDev *blkdev)
547
{
548
    RING_IDX rc, rp;
549
    struct ioreq *ioreq;
550

    
551
    blkdev->more_work = 0;
552

    
553
    rc = blkdev->rings.common.req_cons;
554
    rp = blkdev->rings.common.sring->req_prod;
555
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
556

    
557
    if (use_aio) {
558
        blk_send_response_all(blkdev);
559
    }
560
    while (rc != rp) {
561
        /* pull request from ring */
562
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
563
            break;
564
        }
565
        ioreq = ioreq_start(blkdev);
566
        if (ioreq == NULL) {
567
            blkdev->more_work++;
568
            break;
569
        }
570
        blk_get_request(blkdev, ioreq, rc);
571
        blkdev->rings.common.req_cons = ++rc;
572

    
573
        /* parse them */
574
        if (ioreq_parse(ioreq) != 0) {
575
            if (blk_send_response_one(ioreq)) {
576
                xen_be_send_notify(&blkdev->xendev);
577
            }
578
            ioreq_release(ioreq);
579
            continue;
580
        }
581

    
582
        if (use_aio) {
583
            /* run i/o in aio mode */
584
            ioreq_runio_qemu_aio(ioreq);
585
        } else {
586
            /* run i/o in sync mode */
587
            ioreq_runio_qemu_sync(ioreq);
588
        }
589
    }
590
    if (!use_aio) {
591
        blk_send_response_all(blkdev);
592
    }
593

    
594
    if (blkdev->more_work && blkdev->requests_inflight < max_requests) {
595
        qemu_bh_schedule(blkdev->bh);
596
    }
597
}
598

    
599
/* ------------------------------------------------------------- */
600

    
601
/* Bottom-half handler: process the ring of the device passed as opaque. */
static void blk_bh(void *opaque)
{
    blk_handle_requests((struct XenBlkDev *)opaque);
}
606

    
607
static void blk_alloc(struct XenDevice *xendev)
608
{
609
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
610

    
611
    QLIST_INIT(&blkdev->inflight);
612
    QLIST_INIT(&blkdev->finished);
613
    QLIST_INIT(&blkdev->freelist);
614
    blkdev->bh = qemu_bh_new(blk_bh, blkdev);
615
    if (xen_mode != XEN_EMULATE) {
616
        batch_maps = 1;
617
    }
618
}
619

    
620
static int blk_init(struct XenDevice *xendev)
621
{
622
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
623
    int index, qflags, have_barriers, info = 0;
624

    
625
    /* read xenstore entries */
626
    if (blkdev->params == NULL) {
627
        char *h = NULL;
628
        blkdev->params = xenstore_read_be_str(&blkdev->xendev, "params");
629
        if (blkdev->params != NULL) {
630
            h = strchr(blkdev->params, ':');
631
        }
632
        if (h != NULL) {
633
            blkdev->fileproto = blkdev->params;
634
            blkdev->filename  = h+1;
635
            *h = 0;
636
        } else {
637
            blkdev->fileproto = "<unset>";
638
            blkdev->filename  = blkdev->params;
639
        }
640
    }
641
    if (!strcmp("aio", blkdev->fileproto)) {
642
        blkdev->fileproto = "raw";
643
    }
644
    if (blkdev->mode == NULL) {
645
        blkdev->mode = xenstore_read_be_str(&blkdev->xendev, "mode");
646
    }
647
    if (blkdev->type == NULL) {
648
        blkdev->type = xenstore_read_be_str(&blkdev->xendev, "type");
649
    }
650
    if (blkdev->dev == NULL) {
651
        blkdev->dev = xenstore_read_be_str(&blkdev->xendev, "dev");
652
    }
653
    if (blkdev->devtype == NULL) {
654
        blkdev->devtype = xenstore_read_be_str(&blkdev->xendev, "device-type");
655
    }
656

    
657
    /* do we have all we need? */
658
    if (blkdev->params == NULL ||
659
        blkdev->mode == NULL   ||
660
        blkdev->type == NULL   ||
661
        blkdev->dev == NULL) {
662
        goto out_error;
663
    }
664

    
665
    /* read-only ? */
666
    if (strcmp(blkdev->mode, "w") == 0) {
667
        qflags = BDRV_O_RDWR;
668
    } else {
669
        qflags = 0;
670
        info  |= VDISK_READONLY;
671
    }
672

    
673
    /* cdrom ? */
674
    if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom")) {
675
        info  |= VDISK_CDROM;
676
    }
677

    
678
    /* init qemu block driver */
679
    index = (blkdev->xendev.dev - 202 * 256) / 16;
680
    blkdev->dinfo = drive_get(IF_XEN, 0, index);
681
    if (!blkdev->dinfo) {
682
        /* setup via xenbus -> create new block driver instance */
683
        xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
684
        blkdev->bs = bdrv_new(blkdev->dev);
685
        if (blkdev->bs) {
686
            if (bdrv_open(blkdev->bs, blkdev->filename, qflags,
687
                        bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) {
688
                bdrv_delete(blkdev->bs);
689
                blkdev->bs = NULL;
690
            }
691
        }
692
        if (!blkdev->bs) {
693
            goto out_error;
694
        }
695
    } else {
696
        /* setup via qemu cmdline -> already setup for us */
697
        xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
698
        blkdev->bs = blkdev->dinfo->bdrv;
699
    }
700
    bdrv_attach_dev_nofail(blkdev->bs, blkdev);
701
    blkdev->file_blk  = BLOCK_SIZE;
702
    blkdev->file_size = bdrv_getlength(blkdev->bs);
703
    if (blkdev->file_size < 0) {
704
        xen_be_printf(&blkdev->xendev, 1, "bdrv_getlength: %d (%s) | drv %s\n",
705
                      (int)blkdev->file_size, strerror(-blkdev->file_size),
706
                      blkdev->bs->drv ? blkdev->bs->drv->format_name : "-");
707
        blkdev->file_size = 0;
708
    }
709
    have_barriers = blkdev->bs->drv && blkdev->bs->drv->bdrv_flush ? 1 : 0;
710

    
711
    xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
712
                  " size %" PRId64 " (%" PRId64 " MB)\n",
713
                  blkdev->type, blkdev->fileproto, blkdev->filename,
714
                  blkdev->file_size, blkdev->file_size >> 20);
715

    
716
    /* fill info */
717
    xenstore_write_be_int(&blkdev->xendev, "feature-barrier", have_barriers);
718
    xenstore_write_be_int(&blkdev->xendev, "info",            info);
719
    xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
720
    xenstore_write_be_int(&blkdev->xendev, "sectors",
721
                          blkdev->file_size / blkdev->file_blk);
722
    return 0;
723

    
724
out_error:
725
    g_free(blkdev->params);
726
    blkdev->params = NULL;
727
    g_free(blkdev->mode);
728
    blkdev->mode = NULL;
729
    g_free(blkdev->type);
730
    blkdev->type = NULL;
731
    g_free(blkdev->dev);
732
    blkdev->dev = NULL;
733
    g_free(blkdev->devtype);
734
    blkdev->devtype = NULL;
735
    return -1;
736
}
737

    
738
static int blk_connect(struct XenDevice *xendev)
739
{
740
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
741

    
742
    if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
743
        return -1;
744
    }
745
    if (xenstore_read_fe_int(&blkdev->xendev, "event-channel",
746
                             &blkdev->xendev.remote_port) == -1) {
747
        return -1;
748
    }
749

    
750
    blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
751
    if (blkdev->xendev.protocol) {
752
        if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
753
            blkdev->protocol = BLKIF_PROTOCOL_X86_32;
754
        }
755
        if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_64) == 0) {
756
            blkdev->protocol = BLKIF_PROTOCOL_X86_64;
757
        }
758
    }
759

    
760
    blkdev->sring = xc_gnttab_map_grant_ref(blkdev->xendev.gnttabdev,
761
                                            blkdev->xendev.dom,
762
                                            blkdev->ring_ref,
763
                                            PROT_READ | PROT_WRITE);
764
    if (!blkdev->sring) {
765
        return -1;
766
    }
767
    blkdev->cnt_map++;
768

    
769
    switch (blkdev->protocol) {
770
    case BLKIF_PROTOCOL_NATIVE:
771
    {
772
        blkif_sring_t *sring_native = blkdev->sring;
773
        BACK_RING_INIT(&blkdev->rings.native, sring_native, XC_PAGE_SIZE);
774
        break;
775
    }
776
    case BLKIF_PROTOCOL_X86_32:
777
    {
778
        blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;
779

    
780
        BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32, XC_PAGE_SIZE);
781
        break;
782
    }
783
    case BLKIF_PROTOCOL_X86_64:
784
    {
785
        blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;
786

    
787
        BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64, XC_PAGE_SIZE);
788
        break;
789
    }
790
    }
791

    
792
    xen_be_bind_evtchn(&blkdev->xendev);
793

    
794
    xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
795
                  "remote port %d, local port %d\n",
796
                  blkdev->xendev.protocol, blkdev->ring_ref,
797
                  blkdev->xendev.remote_port, blkdev->xendev.local_port);
798
    return 0;
799
}
800

    
801
static void blk_disconnect(struct XenDevice *xendev)
802
{
803
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
804

    
805
    if (blkdev->bs) {
806
        if (!blkdev->dinfo) {
807
            /* close/delete only if we created it ourself */
808
            bdrv_close(blkdev->bs);
809
            bdrv_delete(blkdev->bs);
810
        }
811
        blkdev->bs = NULL;
812
    }
813
    xen_be_unbind_evtchn(&blkdev->xendev);
814

    
815
    if (blkdev->sring) {
816
        xc_gnttab_munmap(blkdev->xendev.gnttabdev, blkdev->sring, 1);
817
        blkdev->cnt_map--;
818
        blkdev->sring = NULL;
819
    }
820
}
821

    
822
static int blk_free(struct XenDevice *xendev)
823
{
824
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
825
    struct ioreq *ioreq;
826

    
827
    while (!QLIST_EMPTY(&blkdev->freelist)) {
828
        ioreq = QLIST_FIRST(&blkdev->freelist);
829
        QLIST_REMOVE(ioreq, list);
830
        qemu_iovec_destroy(&ioreq->v);
831
        g_free(ioreq);
832
    }
833

    
834
    g_free(blkdev->params);
835
    g_free(blkdev->mode);
836
    g_free(blkdev->type);
837
    g_free(blkdev->dev);
838
    g_free(blkdev->devtype);
839
    qemu_bh_delete(blkdev->bh);
840
    return 0;
841
}
842

    
843
static void blk_event(struct XenDevice *xendev)
844
{
845
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
846

    
847
    qemu_bh_schedule(blkdev->bh);
848
}
849

    
850
struct XenDevOps xen_blkdev_ops = {
851
    .size       = sizeof(struct XenBlkDev),
852
    .flags      = DEVOPS_FLAG_NEED_GNTDEV,
853
    .alloc      = blk_alloc,
854
    .init       = blk_init,
855
    .initialise    = blk_connect,
856
    .disconnect = blk_disconnect,
857
    .event      = blk_event,
858
    .free       = blk_free,
859
};