/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}
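
/*
 * Illustrative sketch (not part of the original source): how a caller might
 * cap a device at 10 MB/s and 100 IOPS. The LeakyBucket field names follow
 * the util/throttle.h layout assumed here; double-check them against your
 * tree. Per the comment on bdrv_io_limits_enable() below, it must be called
 * before bdrv_set_io_limits().
 */
#if 0
static void example_limit_device(BlockDriverState *bs)
{
    ThrottleConfig cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; /* bytes/s */
    cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;              /* ops/s   */

    bdrv_io_limits_enable(bs);
    bdrv_set_io_limits(bs, &cfg);
}
#endif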

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits() if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait, do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* otherwise, schedule the next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
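
/*
 * Illustrative sketch (not part of the original source): where the
 * intercept hook sits on the request path. The coroutine read path later
 * in this file invokes it roughly like this before issuing the driver
 * read (and with is_write=true on the write path).
 */
#if 0
    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, false);
    }
#endif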

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
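
/*
 * Illustrative sketch (not part of the original source): path_combine()
 * resolves a backing-file name relative to the image that references it,
 * while an absolute name is copied unchanged.
 */
#if 0
static void example_path_combine(void)
{
    char dest[PATH_MAX];

    path_combine(dest, sizeof(dest), "/images/base.qcow2", "snap.qcow2");
    /* dest is now "/images/snap.qcow2" */

    path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs/other.raw");
    /* dest is now "/abs/other.raw" */
}
#endif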

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
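
/*
 * Illustrative sketch (not part of the original source): registering a
 * close notifier so a user of a BlockDriverState hears about bdrv_close().
 * bdrv_close() passes the closing bs as the notifier data; the names
 * example_close_cb/example_watch are hypothetical.
 */
#if 0
static void example_close_cb(Notifier *notifier, void *data)
{
    BlockDriverState *bs = data;
    /* drop any cached pointers to bs here */
}

static Notifier example_close_notifier = { .notify = example_close_cb };

static void example_watch(BlockDriverState *bs)
{
    bdrv_add_close_notifier(bs, &example_close_notifier);
}
#endif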

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}
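
/*
 * Illustrative sketch (not part of the original source): creating a 1 GiB
 * qcow2 image with bdrv_create(), using the driver's own create_options
 * template, the same way the snapshot=on path in bdrv_open() below does.
 */
#if 0
static int example_create_qcow2(const char *filename, Error **errp)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *opts;
    int ret;

    if (!drv) {
        error_setg(errp, "qcow2 driver not available");
        return -ENOENT;
    }

    opts = parse_option_parameters("", drv->create_options, NULL);
    set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);

    ret = bdrv_create(drv, filename, opts, errp);
    free_option_parameters(opts);
    return ret;
}
#endif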

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
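
/*
 * Illustrative sketch (not part of the original source): the probe contract
 * used by find_image_format(). A driver scores the first bytes of the image
 * and the highest scorer wins; "MYFM" is a made-up magic for illustration.
 */
#if 0
static int example_probe(const uint8_t *buf, int buf_size,
                         const char *filename)
{
    if (buf_size >= 4 && !memcmp(buf, "MYFM", 4)) {
        return 100;   /* certain: magic matched */
    }
    return 0;         /* not ours */
}
#endif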

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
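
/*
 * Illustrative sketch (not part of the original source): turning a
 * user-visible cache mode string into BDRV_O_* open flags before an open.
 * Per the table above, "none" yields BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 * (O_DIRECT with writeback semantics) and "unsafe" adds BDRV_O_NO_FLUSH.
 */
#if 0
static int example_apply_cache_mode(const char *mode, int *flags)
{
    if (bdrv_parse_cache_flags(mode, flags) < 0) {
        fprintf(stderr, "invalid cache mode: %s\n", mode);
        return -1;
    }
    return 0;
}
#endif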

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* an empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol as drv. This layer is
     * already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   const char *reference, QDict *options, int flags,
                   Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (reference) {
        if (filename || qdict_size(options)) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }
        QDECREF(options);

        bs = bdrv_find(reference);
        if (!bs) {
            error_setg(errp, "Cannot find block device '%s'", reference);
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
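
/*
 * Illustrative sketch (not part of the original source): opening an image
 * at the protocol level. Passing NULL options hands an empty QDict to the
 * block layer; on success *pbs refers to the new BlockDriverState, which
 * the caller releases with bdrv_unref().
 */
#if 0
static int example_open_protocol(const char *path, Error **errp)
{
    BlockDriverState *bs = NULL;
    int ret;

    ret = bdrv_file_open(&bs, path, NULL, NULL, BDRV_O_RDWR, errp);
    if (ret < 0) {
        return ret;
    }
    /* ... use bs ... */
    bdrv_unref(bs);
    return 0;
}
#endif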

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files are always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        BlockDriverState *bs;

        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        bs = bdrv_new("");
        ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
        if (ret < 0) {
            bdrv_unref(bs);
        } else {
            *pbs = bs;
        }
    } else {
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
                             errp);
    }

done:
    qdict_del(options, bdref_key);
    return ret;
}
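
/*
 * Illustrative sketch (not part of the original source): how a flattened
 * BlockdevRef looks when bdrv_open_image() extracts the "file." subtree.
 * Keys such as file.driver and file.filename are split off into the image's
 * own options QDict; test.img is a hypothetical path.
 */
#if 0
static void example_blockdev_ref(void)
{
    BlockDriverState *file = NULL;
    QDict *opts = qdict_new();
    Error *err = NULL;

    qdict_put(opts, "file.driver", qstring_from_str("file"));
    qdict_put(opts, "file.filename", qstring_from_str("test.img"));

    /* force_raw=true: open at the protocol level, no format probing */
    bdrv_open_image(&file, NULL, opts, "file", 0, true, false, &err);
    QDECREF(opts);
}
#endif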

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        bs1 = bdrv_new("");
        QINCREF(options);
        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}
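
/*
 * Illustrative sketch (not part of the original source): opening a qcow2
 * image with format probing disabled via an explicit "driver" option. The
 * options QDict reference is consumed by bdrv_open() even on failure, and
 * the device name "example-disk" is hypothetical.
 */
#if 0
static int example_open_image(const char *path, Error **errp)
{
    BlockDriverState *bs = bdrv_new("example-disk");
    QDict *opts = qdict_new();
    int ret;

    qdict_put(opts, "driver", qstring_from_str("qcow2"));
    ret = bdrv_open(bs, path, opts, BDRV_O_RDWR, NULL, errp);
    if (ret < 0) {
        bdrv_unref(bs);
    }
    return ret;
}
#endif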

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
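
/*
 * Illustrative sketch (not part of the original source): batching several
 * devices into one transactional reopen, the pattern bdrv_reopen() below
 * follows for a single device. If any BDS fails to prepare, every entry
 * is rolled back.
 */
#if 0
static int example_reopen_two(BlockDriverState *a, BlockDriverState *b,
                              int flags, Error **errp)
{
    BlockReopenQueue *queue = NULL;

    queue = bdrv_reopen_queue(queue, a, flags);
    queue = bdrv_reopen_queue(queue, b, flags);
    /* prepares every entry; commits all or aborts all, then frees queue */
    return bdrv_reopen_multiple(queue, errp);
}
#endif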

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}
1709

    
1710
/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}

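/*
 * Usage sketch (illustrative only): the quiesce-then-persist pattern the
 * comment above refers to, assuming bdrv_flush_all() is the flush
 * counterpart mentioned there:
 *
 *     bdrv_drain_all();        // wait for all in-flight requests
 *     ret = bdrv_flush_all();  // then push completed writes to disk
 *
 * Draining first matters because a flush only covers writes that have
 * already reached the driver.
 */
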
/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.  Also, NUL-terminate device_name to prevent a
 * double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap the name, but we don't want to swap the linked-list
     * entries. */
    bs_dest->node_list   = bs_src->node_list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

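/*
 * Usage sketch (illustrative only): live snapshot code uses bdrv_append()
 * roughly like this, assuming 'overlay' was opened anonymously with the
 * current image as its backing file:
 *
 *     bdrv_append(overlay, bs);
 *
 * Afterwards the device-attached BDS ('bs') carries the overlay contents
 * and the previous top image hangs below it as bs->backing_hd.
 */
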
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

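/*
 * Usage sketch (illustrative only): a device model wires up BlockDevOps
 * like this; the my_dev_* callbacks are hypothetical, only the struct
 * fields come from the real API used by the wrappers below:
 *
 *     static const BlockDevOps my_dev_block_ops = {
 *         .change_media_cb  = my_dev_change_media_cb,
 *         .eject_request_cb = my_dev_eject_request_cb,
 *         .is_tray_open     = my_dev_is_tray_open,
 *         .resize_cb        = my_dev_resize_cb,
 *     };
 *
 *     bdrv_set_dev_ops(bs, &my_dev_block_ops, my_dev);
 *
 * The opaque pointer is handed back verbatim as the first argument of
 * every callback.
 */
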
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

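/*
 * Usage sketch (illustrative only): a caller such as 'qemu-img check'
 * distinguishes "check could not run" from "check ran and found problems";
 * the BdrvCheckResult field names are assumptions based on other users:
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_ERRORS);
 *     if (ret < 0) {
 *         // e.g. -ENOTSUP: the format cannot be checked at all
 *     } else if (result.corruptions || result.check_errors) {
 *         // image inspected, problems found (possibly partially fixed)
 *     }
 */
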
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

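/*
 * Usage sketch (illustrative only): committing a single image requires only
 * the top of the chain; the backing file is reopened read-write as needed:
 *
 *     ret = bdrv_commit(bs);   // bs must have a backing_hd
 *     if (ret == 0) {
 *         // backing file now holds the COW data; if the driver implements
 *         // bdrv_make_empty, the top image was emptied as well
 *     }
 *
 * Note that it refuses to run (-EBUSY) while a job owns either image.
 */
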
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, size_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                      - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

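/*
 * Usage sketch: every aligned request in this file brackets its I/O with a
 * begin/end pair, exactly as bdrv_co_do_preadv() does further down:
 *
 *     BdrvTrackedRequest req;
 *
 *     tracked_request_begin(&req, bs, offset, bytes, false);
 *     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, qiov, flags);
 *     tracked_request_end(&req);
 *
 * The request lives on the caller's stack, so overlap detection and
 * serialisation work without any extra allocation.
 */
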
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

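/*
 * Worked example: with a 64 KiB cluster size, c = 65536 / 512 = 128
 * sectors.  A request for sectors [130, 134) (sector_num = 130,
 * nb_sectors = 4) is widened to cluster_sector_num = 128 and
 * cluster_nb_sectors = 128, i.e. the whole cluster [128, 256).
 * Copy-on-read relies on this so that allocating the cluster needs no
 * second backing-file read.
 */
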
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;

    if (!bs->serialising_in_flight) {
        return;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

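/*
 * Usage sketch (illustrative only): bdrv_drop_intermediate() below rewrites
 * the backing link this way, handling the documented error codes:
 *
 *     ret = bdrv_change_backing_file(overlay_bs, base->filename,
 *                                    base->drv ? base->drv->format_name : "");
 *     if (ret == -ENOTSUP) {
 *         // format driver cannot rewrite its header (e.g. raw)
 *     } else if (ret == -ENOSPC) {
 *         // no room in the image header for the new filename
 *     }
 */
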
/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    bdrv_refresh_limits(new_top_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}


static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov,
                                     rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov,
                                      rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write,
                       BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

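/*
 * Usage sketch (illustrative only): zero a whole device and let the driver
 * discard blocks where it can:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         // some range could neither be reported zero nor be zeroed
 *     }
 *
 * Ranges that bdrv_get_block_status() already reports as BDRV_BLOCK_ZERO
 * are skipped, so re-running this on an already-zeroed image is cheap.
 */
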
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

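/*
 * Usage sketch (illustrative only): format drivers use bdrv_pwrite_sync()
 * for metadata that must reach the disk before later writes; MyHeader is a
 * hypothetical on-disk structure at byte offset 0:
 *
 *     MyHeader header;
 *     // ... fill in and byteswap header ...
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;  // the header update is not guaranteed to be on disk
 *     }
 */
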
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating a cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = MAX(0, ROUND_UP(total_sectors - sector_num,
                                         align >> BDRV_SECTOR_BITS));
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        /* TODO Switch to byte granularity */
        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    wait_serialising_requests(req);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        /* TODO Switch to byte granularity */
        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true);
    }

    tracked_request_begin(&req, bs, offset, bytes, true);
    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, qiov, flags);
    tracked_request_end(&req);

    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
3408
{
3409
    return bs->read_only;
3410
}
3411

    
3412
int bdrv_is_sg(BlockDriverState *bs)
3413
{
3414
    return bs->sg;
3415
}
3416

    
3417
int bdrv_enable_write_cache(BlockDriverState *bs)
3418
{
3419
    return bs->enable_write_cache;
3420
}
3421

    
3422
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3423
{
3424
    bs->enable_write_cache = wce;
3425

    
3426
    /* so a reopen() will preserve wce */
3427
    if (wce) {
3428
        bs->open_flags |= BDRV_O_CACHE_WB;
3429
    } else {
3430
        bs->open_flags &= ~BDRV_O_CACHE_WB;
3431
    }
3432
}
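
/*
 * Note that wce == false means writethrough: while the cache is
 * disabled, bdrv_aligned_pwritev() above follows every successful
 * driver write with a bdrv_co_flush() before reporting completion.
 */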

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

/* Find a block backend by its device name */
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Find a named node in the graph of BlockDriverStates */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Put this QMP function here so it can access the static graph_bdrv_states. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        entry->next = list;
        list = entry;
    }

    return list;
}

BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp)
{
    BlockDriverState *bs = NULL;

    if ((!device && !node_name) || (device && node_name)) {
        error_setg(errp, "Use either device or node-name but not both");
        return NULL;
    }

    if (device) {
        bs = bdrv_find(device);

        if (!bs) {
            error_set(errp, QERR_DEVICE_NOT_FOUND, device);
            return NULL;
        }

        return bs;
    }

    bs = bdrv_find_node(node_name);

    if (!bs) {
        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
        return NULL;
    }

    return bs;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, device_list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns the allocation status of the specified sectors as BDRV_BLOCK_*
 * flags. Drivers not implementing the functionality are assumed to not
 * support backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}
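
/*
 * The value assembled above is a bit field: BDRV_BLOCK_DATA means the
 * sectors are allocated in this layer, BDRV_BLOCK_ZERO means they are
 * known to read as zeroes, and BDRV_BLOCK_OFFSET_VALID means the bits
 * above BDRV_SECTOR_BITS carry the host offset of the data, which is
 * why 'ret >> BDRV_SECTOR_BITS' is used as the sector number when
 * recursing into bs->file.
 */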

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}
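
/*
 * A rough caller-side sketch (illustrative, not part of this file):
 * walking an image's allocation map run by run.
 *
 *     int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 *     int64_t sector = 0;
 *     int num;
 *
 *     while (sector < total) {
 *         int64_t st = bdrv_get_block_status(bs, sector,
 *                                            MIN(total - sector, INT_MAX),
 *                                            &num);
 *         if (st < 0 || num == 0) {
 *             break;
 *         }
 *         sector += num;
 *     }
 *
 * Each returned run [sector, sector + num) shares one BDRV_BLOCK_* state.
 */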

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
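
/*
 * Merge example: after sorting, a request for sectors 0-7 followed by
 * one for sectors 8-15 has reqs[1].sector == oldreq_last == 8, so the
 * two are combined into a single request covering sectors 0-15 whose
 * qiov concatenates both original vectors.
 */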

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
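
/*
 * A minimal caller-side sketch (illustrative only; my_cb and the state
 * pointers are hypothetical): submit two writes as one batch, each with
 * its own completion callback.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_cb, .opaque = req0_state },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_cb, .opaque = req1_state },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... inspect reqs[i].error: requests with error != 0 will
 *         never see their callback invoked.
 *     }
 */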

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
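
/*
 * This is the usual AIO-to-coroutine bridge: the coroutine issues the
 * driver's AIO call with bdrv_co_io_em_complete() as its callback and
 * yields; the callback stores the result and re-enters the coroutine,
 * so co.ret is valid by the time qemu_coroutine_yield() returns.
 */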

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
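
/*
 * In short, a flush proceeds in three stages: format-driver data is
 * written out to the host OS (bdrv_co_flush_to_os), then forced to
 * stable storage (bdrv_co_flush_to_disk, or the AIO fallback), and
 * finally the underlying protocol layer (bs->file) is flushed the same
 * way.  BDRV_O_NO_FLUSH (cache=unsafe) skips only the second stage.
 */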

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
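
/*
 * Alignment example: with bs->bl.discard_alignment == 8, a request
 * starting at sector 5 is clipped to num = 8 - (5 % 8) = 3 sectors on
 * the first iteration, so subsequent iterations start at the aligned
 * sector 8 and are limited only by max_discard.
 */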

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
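
/*
 * Granularity example: a byte granularity of 64 KiB becomes 128 sectors
 * after the shift, and ffs(128) - 1 == 7, so the HBitmap keeps one bit
 * per 2^7 = 128 sectors.  The asserts require granularity to be a power
 * of two no smaller than BDRV_SECTOR_SIZE.
 */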
4996

    
4997
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4998
{
4999
    BdrvDirtyBitmap *bm, *next;
5000
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5001
        if (bm == bitmap) {
5002
            QLIST_REMOVE(bitmap, list);
5003
            hbitmap_free(bitmap->bitmap);
5004
            g_free(bitmap);
5005
            return;
5006
        }
5007
    }
5008
}
5009

    
5010
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5011
{
5012
    BdrvDirtyBitmap *bm;
5013
    BlockDirtyInfoList *list = NULL;
5014
    BlockDirtyInfoList **plist = &list;
5015

    
5016
    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5017
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5018
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5019
        info->count = bdrv_get_dirty_count(bs, bm);
5020
        info->granularity =
5021
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5022
        entry->value = info;
5023
        *plist = entry;
5024
        plist = &entry->next;
5025
    }
5026

    
5027
    return list;
5028
}
5029

    
5030
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5031
{
5032
    if (bitmap) {
5033
        return hbitmap_get(bitmap->bitmap, sector);
5034
    } else {
5035
        return 0;
5036
    }
5037
}
5038

    
5039
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

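/* Mark a range of sectors dirty in every dirty bitmap attached to bs. */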
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

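/* Clear a range of sectors in every dirty bitmap attached to bs. */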
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

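/* Return the number of dirty sectors recorded in bitmap. */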
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If the reference count drops to zero, the BlockDriverState is deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

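/* Mark bs as in use (e.g. while a block job owns it) or release it again;
 * asserts that the flag actually changes. */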
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

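/* Enable the I/O status mechanism for bs and reset the status to "ok". */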
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

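/* Reset the I/O status to "ok", including the iostatus of any block job
 * running on bs. */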
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

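/* Record an I/O error: ENOSPC is reported as "nospace", any other error as
 * "failed".  Only the first error after a reset is recorded. */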
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

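/* Start accounting an I/O operation: remember its size, type and start time
 * in the caller-provided cookie.  Callers pair this with bdrv_acct_done(),
 * e.g. (a sketch, not taken from this file):
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ... perform the read ...
 *     bdrv_acct_done(bs, &cookie);
 */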
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

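/* Finish accounting: add the operation's byte count, request count and
 * elapsed time to the per-type statistics of bs. */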
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

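/* Create a disk image with format fmt at filename.  base_filename and
 * base_fmt optionally select a backing file, options carries -o style
 * creation options, and errors are reported through errp. */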
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                bdrv_unref(bs);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

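/* Register a notifier that is invoked before each write request to bs
 * (used e.g. by the backup job to copy out the old contents first). */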
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

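/* Amend the creation-time options of an existing image in place.  Returns
 * -ENOTSUP if the driver does not implement option amendment. */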
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

/* Used to recurse on single child block filters.
 * Single child block filters store their child in bs->file.
 */
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (!bs->drv) {
        return false;
    }

    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
        if (bs == candidate) {
            return true;
        } else {
            return false;
        }
    }

    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
        return false;
    }

    if (!bs->file) {
        return false;
    }

    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}

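/* Recurse down the chain starting at bs; drivers may override the generic
 * single-child recursion with their own hook. */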
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    return bdrv_generic_is_first_non_filter(bs, candidate);
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents it explores all bs chains
 * from the top. Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        if (!bs->file) {
            continue;
        }

        perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);

        /* candidate is the first non-filter */
        if (perm) {
            return true;
        }
    }

    return false;
}