/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}
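
/*
 * Illustrative sketch (not part of the original file): capping total
 * bandwidth at 10 MB/s. Field names follow qemu/throttle.h of this era
 * (LeakyBucket.avg, THROTTLE_BPS_TOTAL); treat the details as assumptions.
 * Per the comment on bdrv_io_limits_enable() below, throttling must be
 * enabled before the limits are set:
 *
 *   ThrottleConfig cfg = {
 *       .buckets = { [THROTTLE_BPS_TOTAL] = { .avg = 10 << 20 } },
 *   };
 *   bdrv_io_limits_enable(bs);
 *   bdrv_set_io_limits(bs, &cfg);
 */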

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
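
/*
 * Illustrative sketch (not part of the original file): callers typically
 * use this value to allocate I/O buffers that satisfy the driver's memory
 * alignment requirement, e.g. with qemu_memalign()/qemu_vfree():
 *
 *   void *buf = qemu_memalign(bdrv_opt_mem_align(bs), len);
 *   ...
 *   qemu_vfree(buf);
 */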

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
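
/*
 * Illustrative examples (not part of the original file):
 *
 *   path_has_protocol("nbd:unix:/tmp/sock")  -> 1  (':' comes before any '/')
 *   path_has_protocol("/var/img/test.qcow2") -> 0  ('/' comes before any ':')
 *   path_has_protocol("c:\img.raw")          -> 0  (on Windows, drive letters
 *                                                   are not protocol prefixes)
 */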

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
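
/*
 * Illustrative sketch (not part of the original file):
 *
 *   char dest[PATH_MAX];
 *   path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.raw");
 *   // dest is now "/images/backing.raw"
 *   path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs/backing.raw");
 *   // dest is now "/abs/backing.raw" (absolute filenames are copied as-is)
 */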

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
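
/*
 * Illustrative sketch (not part of the original file): creating a 1 GiB
 * qcow2 image, mirroring how the BDRV_O_SNAPSHOT path in bdrv_open() builds
 * its temporary overlay. Error handling is elided.
 *
 *   BlockDriver *drv = bdrv_find_format("qcow2");
 *   QEMUOptionParameter *opts =
 *       parse_option_parameters("", drv->create_options, NULL);
 *   set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);
 *   bdrv_create(drv, "/tmp/test.qcow2", opts, &err);
 *   free_option_parameters(opts);
 */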

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
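
/*
 * Illustrative sketch (not part of the original file): this is how the
 * BDRV_O_SNAPSHOT path in bdrv_open() obtains a name for its temporary
 * qcow2 overlay:
 *
 *   char tmp_filename[PATH_MAX + 1];
 *   int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *   if (ret < 0) {
 *       // no usable temporary file; the negative errno says why
 *   }
 */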

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
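
/*
 * Illustrative sketch (not part of the original file), e.g. when parsing a
 * "discard=unmap" option string:
 *
 *   int flags = bs->open_flags;
 *   if (bdrv_parse_discard_flags("unmap", &flags) < 0) {
 *       // invalid mode string; report an error to the user
 *   }
 *   // flags now has BDRV_O_UNMAP set
 */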

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
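
/*
 * Summary of the mapping above (added sketch, not in the original file):
 *
 *   mode          BDRV_O_NOCACHE   BDRV_O_CACHE_WB   BDRV_O_NO_FLUSH
 *   off/none      yes              yes               no
 *   directsync    yes              no                no
 *   writeback     no               yes               no
 *   unsafe        no               yes               yes
 *   writethrough  no               no                no
 */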

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called
 * bdrv_disable_copy_on_read() to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   const char *reference, QDict *options, int flags,
                   Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (reference) {
        if (filename || qdict_size(options)) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }
        QDECREF(options);

        bs = bdrv_find(reference);
        if (!bs) {
            error_setg(errp, "Cannot find block device '%s'", reference);
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
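
/*
 * Illustrative sketch (not part of the original file): opening a protocol
 * file with an options QDict. Per the contract above, the reference to
 * 'opts' is consumed even on failure.
 *
 *   BlockDriverState *file_bs;
 *   QDict *opts = qdict_new();
 *   qdict_put(opts, "filename", qstring_from_str("/var/img/test.raw"));
 *   int ret = bdrv_file_open(&file_bs, NULL, NULL, opts, BDRV_O_RDWR, &err);
 */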

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        BlockDriverState *bs;

        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        bs = bdrv_new("");
        ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
        if (ret < 0) {
            bdrv_unref(bs);
        } else {
            *pbs = bs;
        }
    } else {
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
                             errp);
    }

done:
    qdict_del(options, bdref_key);
    return ret;
}
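
/*
 * Illustrative sketch (not part of the original file): with a flattened
 * options QDict such as
 *
 *   { "file.driver": "file", "file.filename": "/var/img/test.qcow2" }
 *
 * the call below extracts every "file."-prefixed option as the BlockdevRef
 * and opens it as the protocol layer (force_raw=true skips format probing):
 *
 *   BlockDriverState *file = NULL;
 *   ret = bdrv_open_image(&file, NULL, options, "file",
 *                         flags, true, true, &err);
 */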

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        bs1 = bdrv_new("");
        QINCREF(options);
        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
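
/*
 * Illustrative sketch (not part of the original file): reopening two
 * related BDSes in one atomic set, e.g. an overlay read-write and its
 * backing file read-only. Either both changes are committed or both are
 * aborted. 'overlay_bs' and 'backing_bs' are hypothetical names.
 *
 *   BlockReopenQueue *queue = NULL;
 *   queue = bdrv_reopen_queue(queue, overlay_bs, flags | BDRV_O_RDWR);
 *   queue = bdrv_reopen_queue(queue, backing_bs, flags & ~BDRV_O_RDWR);
 *   ret = bdrv_reopen_multiple(queue, &err);   // frees the queue
 */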

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1700

    
1701
void bdrv_close_all(void)
1702
{
1703
    BlockDriverState *bs;
1704

    
1705
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1706
        bdrv_close(bs);
1707
    }
1708
}
1709

    
1710
/* Check if any requests are in-flight (including throttled requests) */
1711
static bool bdrv_requests_pending(BlockDriverState *bs)
1712
{
1713
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
1714
        return true;
1715
    }
1716
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1717
        return true;
1718
    }
1719
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1720
        return true;
1721
    }
1722
    if (bs->file && bdrv_requests_pending(bs->file)) {
1723
        return true;
1724
    }
1725
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1726
        return true;
1727
    }
1728
    return false;
1729
}
1730

    
1731
static bool bdrv_requests_pending_all(void)
1732
{
1733
    BlockDriverState *bs;
1734
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1735
        if (bdrv_requests_pending(bs)) {
1736
            return true;
1737
        }
1738
    }
1739
    return false;
1740
}
1741

    
1742
/*
1743
 * Wait for pending requests to complete across all BlockDriverStates
1744
 *
1745
 * This function does not flush data to disk, use bdrv_flush_all() for that
1746
 * after calling this function.
1747
 *
1748
 * Note that completion of an asynchronous I/O operation can trigger any
1749
 * number of other I/O operations on other devices---for example a coroutine
1750
 * can be arbitrarily complex and a constant flow of I/O can come until the
1751
 * coroutine is complete.  Because of this, it is not possible to have a
1752
 * function to drain a single device's I/O queue.
1753
 */
1754
void bdrv_drain_all(void)
1755
{
1756
    /* Always run first iteration so any pending completion BHs run */
1757
    bool busy = true;
1758
    BlockDriverState *bs;
1759

    
1760
    while (busy) {
1761
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1762
            bdrv_start_throttled_reqs(bs);
1763
        }
1764

    
1765
        busy = bdrv_requests_pending_all();
1766
        busy |= aio_poll(qemu_get_aio_context(), busy);
1767
    }
1768
}
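
/*
 * Illustrative sketch (not in the original source): shutdown paths quiesce
 * I/O before flushing, because completions observed while draining may
 * still issue new writes:
 *
 *     bdrv_drain_all();   // wait for in-flight and throttled requests
 *     bdrv_flush_all();   // then push everything down to stable storage
 */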

/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
 * Also, NUL-terminate the device_name to prevent a double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list   = bs_src->node_list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
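
/*
 * Illustrative sketch (not in the original source): a live snapshot splices
 * a freshly opened, still-anonymous overlay on top of the device's current
 * image; 'overlay' here is hypothetical and would come from bdrv_open()
 * with BDRV_O_NO_BACKING set.
 *
 *     bdrv_append(overlay, bs);
 *     // 'bs' (still what the device sees) now holds the overlay contents,
 *     // and its backing_hd points at the previous top image.
 */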

static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into its backing image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
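
/*
 * Illustrative sketch (not in the original source): folding an overlay back
 * into its backing image, e.g. after an installation onto a qcow2 COW layer:
 *
 *     int ret = bdrv_commit(bs);
 *     if (ret == -EBUSY) {
 *         // bs or its backing file is owned by a block job; retry later
 *     }
 */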

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
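
/*
 * Worked example (illustrative, not in the original source): with a 64 KiB
 * cluster size, c is 65536 / 512 = 128 sectors, so a request for sectors
 * [130, 140) is widened to *cluster_sector_num = 128 and
 * *cluster_nb_sectors = 128, i.e. the one cluster [128, 256) that fully
 * covers it.
 */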

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}
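
/*
 * Worked example (illustrative, not in the original source): with
 * 128-sector clusters, requests for sectors [0, 8) and [120, 128) both
 * round to [0, 128), so the second waits on the first even though their
 * sector ranges are disjoint; this is exactly the serialization the
 * comment above relies on for copy-on-read.
 */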

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    bdrv_refresh_limits(new_top_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov,
                                     rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov,
                                      rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write,
                       BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
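
/*
 * Note (illustrative, not in the original source): this is QEMU's standard
 * synchronous wrapper around a coroutine.  In coroutine context the entry
 * function runs inline; otherwise a coroutine is spawned and the caller
 * spins in qemu_aio_wait() until RwCo.ret leaves the NOT_DONE sentinel.
 */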

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
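
/*
 * Illustrative sketch (not in the original source): pre-zeroing a device
 * while letting the block layer skip ranges that already read as zeroes
 * and unmap where the driver allows it:
 *
 *     int ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         error_report("zeroing failed: %s", strerror(-ret));
 *     }
 */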

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
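
/*
 * Illustrative sketch (not in the original source): format drivers use
 * bdrv_pwrite_sync() for metadata that must reach the disk before dependent
 * updates are issued; 'header' and its offset are hypothetical:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;     // nothing dependent was written yet, safe to bail
 *     }
 */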

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = MAX(0, total_sectors - sector_num);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        /* TODO Switch to byte granularity */
        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_preadv(bs, offset, bytes,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
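
/*
 * Worked example (illustrative, not in the original source): with a
 * 4096-byte request_alignment, a 2000-byte read at offset 5000 gains a
 * 904-byte head buffer and a 1192-byte tail buffer, becoming a single
 * aligned 4096-byte read at offset 4096; the caller's qiov still receives
 * only its 2000 bytes.
 */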

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}
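
/*
 * Worked example (illustrative, not in the original source): with
 * write_zeroes_alignment = 128, a request for sectors [100, 500) is issued
 * as [100, 128) (unaligned head), [128, 384) (aligned bulk) and [384, 500)
 * (tail), so the driver sees at most one short fragment at each end.
 */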

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, true);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
3329

    
3330
/* This is done by device models because, while the block layer knows
3331
 * about the error, it does not know whether an operation comes from
3332
 * the device or the block layer (from a job, for example).
3333
 */
3334
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3335
                       bool is_read, int error)
3336
{
3337
    assert(error >= 0);
3338
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3339
    if (action == BDRV_ACTION_STOP) {
3340
        vm_stop(RUN_STATE_IO_ERROR);
3341
        bdrv_iostatus_set_err(bs, error);
3342
    }
3343
}
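
/*
 * Illustrative call sequence (an assumed device-model pattern, not code from
 * this file): on a failed request, a device model queries the configured
 * policy and then reports it, passing a positive errno as required by the
 * assert above:
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, is_read, -ret);
 *     bdrv_error_action(bs, action, is_read, -ret);
 */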

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

/* This function finds a block backend by its device name */
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

/* This function finds a node in the BDS graph by its node name */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Put this QMP function here so it can access the static graph_bdrv_states. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        entry->next = list;
        list = entry;
    }

    return list;
}

BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp)
{
    BlockDriverState *bs = NULL;

    if ((!device && !node_name) || (device && node_name)) {
        error_setg(errp, "Use either device or node-name but not both");
        return NULL;
    }

    if (device) {
        bs = bdrv_find(device);

        if (!bs) {
            error_set(errp, QERR_DEVICE_NOT_FOUND, device);
            return NULL;
        }

        return bs;
    }

    bs = bdrv_find_node(node_name);

    if (!bs) {
        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
        return NULL;
    }

    return bs;
}
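
/*
 * Illustrative example (hypothetical QMP handler, not part of this file):
 * exactly one of 'device' and 'node-name' may be supplied, so optional
 * arguments are passed as NULL:
 *
 *     bs = bdrv_lookup_bs(has_device ? device : NULL,
 *                         has_node_name ? node_name : NULL, errp);
 *     if (!bs) {
 *         return;
 *     }
 */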

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, device_list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}
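
/*
 * Interpretation sketch (hypothetical caller, not part of this file): the
 * return value is a bitmask, so callers test the BDRV_BLOCK_* flags rather
 * than comparing the value directly:
 *
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (ret < 0) {
 *         ... I/O error ...
 *     } else if (ret & BDRV_BLOCK_ZERO) {
 *         ... the next pnum sectors read as zeroes ...
 *     } else if (ret & BDRV_BLOCK_DATA) {
 *         ... the next pnum sectors contain data ...
 *     }
 */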

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case, this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
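
/*
 * Illustrative example (hypothetical caller, not part of this file): two
 * exactly sequential 8-sector writes would be merged by multiwrite_merge()
 * into one 16-sector request, while each callback still fires once:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_cb, .opaque = req0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_cb, .opaque = req1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... no callback runs for entries with reqs[i].error != 0 ...
 *     }
 */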

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
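
/*
 * Note: this is the standard AIO-to-coroutine bridge.  The coroutine submits
 * the request with bdrv_co_io_em_complete() as the completion callback and
 * yields; the callback stores the result and re-enters the coroutine, so
 * bdrv_co_io_em() only returns co.ret once the request has completed.
 */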

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits, use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* pass the clamped chunk size, not the full remaining count */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
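
/*
 * Worked example for the alignment step above: with bl.discard_alignment == 8
 * and sector_num == 5, the first chunk is trimmed to num = 8 - (5 % 8) = 3
 * sectors so that the next iteration starts at the aligned sector 8; later
 * chunks are then only limited by max_discard.
 */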

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
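
/*
 * Worked example: granularity == 65536 bytes gives 65536 >> BDRV_SECTOR_BITS
 * == 128 sectors per bit, and ffs(128) - 1 == 7, so the HBitmap tracks
 * dirtiness in 2^7-sector (64 KiB) chunks.
 */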
4934

    
4935
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4936
{
4937
    BdrvDirtyBitmap *bm, *next;
4938
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
4939
        if (bm == bitmap) {
4940
            QLIST_REMOVE(bitmap, list);
4941
            hbitmap_free(bitmap->bitmap);
4942
            g_free(bitmap);
4943
            return;
4944
        }
4945
    }
4946
}
4947

    
4948
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
4949
{
4950
    BdrvDirtyBitmap *bm;
4951
    BlockDirtyInfoList *list = NULL;
4952
    BlockDirtyInfoList **plist = &list;
4953

    
4954
    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
4955
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
4956
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
4957
        info->count = bdrv_get_dirty_count(bs, bm);
4958
        info->granularity =
4959
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
4960
        entry->value = info;
4961
        *plist = entry;
4962
        plist = &entry->next;
4963
    }
4964

    
4965
    return list;
4966
}
4967

    
4968
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
4969
{
4970
    if (bitmap) {
4971
        return hbitmap_get(bitmap->bitmap, sector);
4972
    } else {
4973
        return 0;
4974
    }
4975
}
4976

    
4977
void bdrv_dirty_iter_init(BlockDriverState *bs,
4978
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
4979
{
4980
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
4981
}
4982

    
4983
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
4984
                    int nr_sectors)
4985
{
4986
    BdrvDirtyBitmap *bitmap;
4987
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4988
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
4989
    }
4990
}
4991

    
4992
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
4993
{
4994
    BdrvDirtyBitmap *bitmap;
4995
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4996
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
4997
    }
4998
}
4999

    
5000
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5001
{
5002
    return hbitmap_count(bitmap->bitmap);
5003
}
5004

    
5005
/* Get a reference to bs */
5006
void bdrv_ref(BlockDriverState *bs)
5007
{
5008
    bs->refcnt++;
5009
}
5010

    
5011
/* Release a previously grabbed reference to bs.
5012
 * If after releasing, reference count is zero, the BlockDriverState is
5013
 * deleted. */
5014
void bdrv_unref(BlockDriverState *bs)
5015
{
5016
    assert(bs->refcnt > 0);
5017
    if (--bs->refcnt == 0) {
5018
        bdrv_delete(bs);
5019
    }
5020
}
5021

    
5022
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5023
{
5024
    assert(bs->in_use != in_use);
5025
    bs->in_use = in_use;
5026
}
5027

    
5028
int bdrv_in_use(BlockDriverState *bs)
5029
{
5030
    return bs->in_use;
5031
}
5032

    
5033
void bdrv_iostatus_enable(BlockDriverState *bs)
5034
{
5035
    bs->iostatus_enabled = true;
5036
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5037
}
5038

    
5039
/* The I/O status is only enabled if the drive explicitly
5040
 * enables it _and_ the VM is configured to stop on errors */
5041
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5042
{
5043
    return (bs->iostatus_enabled &&
5044
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5045
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5046
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5047
}
5048

    
5049
void bdrv_iostatus_disable(BlockDriverState *bs)
5050
{
5051
    bs->iostatus_enabled = false;
5052
}
5053

    
5054
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

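/* Usage sketch (not from this file): device emulation brackets each guest
 * request with an accounting cookie, typically kept on the stack or inside
 * the request structure:
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ...issue the I/O; then, in the completion path:...
 *     bdrv_acct_done(bs, &cookie);
 */
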
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                bdrv_unref(bs);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
}

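/* Usage sketch (not from this file; filenames are made up, flags kept at 0
 * for brevity): create a qcow2 overlay whose size is taken from its backing
 * file, which is what passing (uint64_t)-1 as img_size triggers above:
 *
 *     Error *err = NULL;
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.qcow2", NULL,
 *                     NULL, (uint64_t)-1, 0, &err, false);
 *     if (err) {
 *         ...report the error, then error_free(err)...
 *     }
 */
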
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

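/* Usage sketch (not from this file; before_write() is a made-up name).
 * Each notifier fires before a write request is issued; the data argument
 * is the BdrvTrackedRequest, and returning a negative errno fails the
 * write (cf. the backup job's before-write notifier):
 *
 *     static int before_write(NotifierWithReturn *notifier, void *opaque)
 *     {
 *         BdrvTrackedRequest *req = opaque;
 *         ...act on req->sector_num / req->nb_sectors...
 *         return 0;
 *     }
 *
 *     NotifierWithReturn n = { .notify = before_write };
 *     bdrv_add_before_write_notifier(bs, &n);
 */
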
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

/* Used to recurse on single-child block filters.
 * Single-child block filters store their child in bs->file.
 */
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (!bs->drv) {
        return false;
    }

    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
        if (bs == candidate) {
            return true;
        } else {
            return false;
        }
    }

    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
        return false;
    }

    if (!bs->file) {
        return false;
    }

    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}

bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    return bdrv_generic_is_first_non_filter(bs, candidate);
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain.  Since we don't have pointers to parents it explores all bs
 * chains from the top.  Some filters can choose not to pass down the
 * recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        if (!bs->file) {
            continue;
        }

        perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);

        /* candidate is the first non-filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
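
/* Example (sketch; the chain is illustrative): given
 *
 *     filter driver (BS_IS_A_FILTER and BS_FILTER_PASS_DOWN set)
 *         -> qcow2
 *
 * bdrv_generic_is_first_non_filter() recurses through the filter's
 * bs->file, so only the qcow2 BDS can be the first non-filter here. */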