/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

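/* Timer callbacks: restart one queued throttled request of the matching
 * type (index 0 is the read queue, index 1 is the write queue). */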
static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
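/* Typical usage (sketch; 'cfg' stands for a caller-provided ThrottleConfig):
 *
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 *     ...
 *     bdrv_io_limits_disable(bs);
 */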
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* must this I/O request wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

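/* Coroutine entry point for bdrv_create(); the result and any error are
 * stored in the CreateCo passed as opaque. */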
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

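/*
 * Find the protocol-level driver for a filename: host device probing comes
 * first, then the "<protocol>:" prefix is matched against the registered
 * drivers; without a prefix (or when prefixes are not allowed) the plain
 * "file" driver is returned.
 */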
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

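/*
 * Probe the image format: read the first 2048 bytes and let every driver
 * with a bdrv_probe() callback score the header; the highest score wins.
 */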
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

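/* Compute the flags passed down to the driver's open function: block-layer
 * internal flags are stripped, and temporary snapshot images are forced
 * writable. */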
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* bdrv_open() was called with a protocol driver directly as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp, "Driver '%s' is not whitelisted", drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(drv->bdrv_parse_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (filename) {
            error_setg_errno(errp, -ret, "Could not open '%s'", filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(filename != NULL);
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags, Error **errp)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (!drv->bdrv_parse_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_propagate(errp, local_err);
        return ret;
    }
    return 0;
}

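/*
 * Move all entries of src whose key starts with 'start' into a freshly
 * allocated *dst, stripping the prefix from the key.
 */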
static void extract_subqdict(QDict *src, QDict **dst, const char *start)
{
    const QDictEntry *entry, *next;
    const char *p;

    *dst = qdict_new();
    entry = qdict_first(src);

    while (entry != NULL) {
        next = qdict_next(src, entry);
        if (strstart(entry->key, start, &p)) {
            qobject_incref(entry->value);
            qdict_put_obj(*dst, p, entry->value);
            qdict_del(src, entry->key);
        }
        entry = next;
    }
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        char backing_filename[PATH_MAX];

        if (qdict_size(options) != 0) {
            error_setg(errp, "Can't use snapshot=on with driver-specific options");
            ret = -EINVAL;
            goto fail;
        }
        assert(filename != NULL);

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, NULL, 0, drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        /* Real path is meaningless for protocols */
        if (path_has_protocol(filename)) {
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        } else if (!realpath(filename, backing_filename)) {
            error_setg_errno(errp, errno, "Could not resolve path '%s'", filename);
            ret = -errno;
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
                             backing_filename);
        if (drv) {
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
        qdict_del(options, "driver");
    }

    if (!drv) {
        ret = find_image_format(file, filename, &drv, &local_err);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
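/* Minimal usage sketch ('bs', 'new_flags', 'errp' are caller-provided; cf.
 * bdrv_reopen() below, which does exactly this for a single BDS):
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, new_flags);
 *     int ret = bdrv_reopen_multiple(queue, errp);
 */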
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

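/* True if any registered BlockDriverState has in-flight or throttled
 * requests. */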
static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (bdrv_start_throttled_reqs(bs)) {
                busy = true;
            }
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states
   list.  Also, clear device_name so that a second removal is a no-op. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

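/* Give the driver a chance to update its internal back-pointers after the
 * BlockDriverState contents have been swapped (see bdrv_swap() below). */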
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

static void bdrv_delete(BlockDriverState *bs)
1713
{
1714
    assert(!bs->dev);
1715
    assert(!bs->job);
1716
    assert(!bs->in_use);
1717
    assert(!bs->refcnt);
1718

    
1719
    bdrv_close(bs);
1720

    
1721
    /* remove from list, if necessary */
1722
    bdrv_make_anon(bs);
1723

    
1724
    g_free(bs);
1725
}
1726

    
1727
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1728
/* TODO change to DeviceState *dev when all users are qdevified */
1729
{
1730
    if (bs->dev) {
1731
        return -EBUSY;
1732
    }
1733
    bs->dev = dev;
1734
    bdrv_iostatus_reset(bs);
1735
    return 0;
1736
}
1737

    
1738
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1739
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1740
{
1741
    if (bdrv_attach_dev(bs, dev) < 0) {
1742
        abort();
1743
    }
1744
}
1745

    
1746
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1747
/* TODO change to DeviceState *dev when all users are qdevified */
1748
{
1749
    assert(bs->dev == dev);
1750
    bs->dev = NULL;
1751
    bs->dev_ops = NULL;
1752
    bs->dev_opaque = NULL;
1753
    bs->buffer_alignment = 512;
1754
}
1755

    
1756
/* TODO change to return DeviceState * when all users are qdevified */
1757
void *bdrv_get_attached_dev(BlockDriverState *bs)
1758
{
1759
    return bs->dev;
1760
}
1761

    
1762
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1763
                      void *opaque)
1764
{
1765
    bs->dev_ops = ops;
1766
    bs->dev_opaque = opaque;
1767
}
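
/* Illustrative sketch (editor's example, not part of the original code):
 * a device model wires itself up to the callbacks used below roughly like
 * so.  The device and callback names are hypothetical; only the BlockDevOps
 * fields actually referenced in this file (change_media_cb, eject_request_cb,
 * is_tray_open, resize_cb, is_medium_locked) are assumed to exist.
 *
 *     static const BlockDevOps my_dev_block_ops = {
 *         .change_media_cb  = my_dev_change_media_cb,
 *         .eject_request_cb = my_dev_eject_request_cb,
 *         .is_tray_open     = my_dev_is_tray_open,
 *         .resize_cb        = my_dev_resize_cb,
 *     };
 *
 *     bdrv_attach_dev_nofail(bs, my_dev);
 *     bdrv_set_dev_ops(bs, &my_dev_block_ops, my_dev);
 *
 * Unset callbacks may be left NULL; the bdrv_dev_*() helpers below check for
 * NULL before invoking them.
 */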

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
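
/* Worked example (editor's illustration, not in the original source): for an
 * image with a 64 KiB cluster size, c = 65536 / BDRV_SECTOR_SIZE = 128
 * sectors.  Rounding sector_num = 130, nb_sectors = 10 gives
 * cluster_sector_num = QEMU_ALIGN_DOWN(130, 128) = 128 and
 * cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128) = 128, i.e. the
 * whole cluster covering sectors [128, 256).
 */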

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
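
/* Editor's example (not in the original source): a tracked request covering
 * sectors [8, 16) does not overlap [0, 8) or [16, 24) (half-open ranges that
 * merely touch are disjoint), but it does overlap [12, 20), which straddles
 * its end.  Callers pass cluster-rounded ranges, so "overlap" here really
 * means "touches the same cluster".
 */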

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;


    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}


static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov,
                                     rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov,
                                      rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write,
                       BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In the synchronous call context the vcpu is blocked, so this throttling
     * timer will never fire; the I/O throttling function therefore has to be
     * disabled here if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
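
/* Worked example (editor's note, not part of the original source): with
 * BDRV_SECTOR_SIZE = 512, a call bdrv_pread(bs, 1000, buf, 2000) splits into
 * three phases:
 *   head: len = (512 - 1000) & 511 = 24 bytes, read via a bounce sector
 *         (sector 1, copying from in-sector offset 488) to reach offset 1024;
 *   body: 1976 remaining bytes give nb_sectors = 1976 >> 9 = 3 sectors
 *         (1536 bytes), read directly into buf;
 *   tail: the final 440 bytes come from one more bounce-sector read.
 */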

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
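
/* Editor's note (not in the original source): format drivers typically use
 * bdrv_pwrite_sync() for metadata updates whose ordering matters, e.g. a
 * hypothetical header update such as
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *
 * so that the header is stable on disk before any write that depends on it
 * is submitted.
 */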

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, false);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = len >> BDRV_SECTOR_BITS;
        max_nb_sectors = MAX(0, total_sectors - sector_num);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, true);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        bdrv_set_dirty(bs, sector_num, nb_sectors);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
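
/* Editor's example (not in the original source): with the error policy
 * BLOCKDEV_ON_ERROR_ENOSPC, a failed write with error == ENOSPC maps to
 * BDRV_ACTION_STOP (pause the VM so the administrator can grow the storage),
 * while any other errno, e.g. EIO, maps to BDRV_ACTION_REPORT (forward the
 * error to the guest).
 */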

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns the allocation status (a mask of BDRV_BLOCK_* flags) of the
 * specified sectors, or a negative errno on failure. Drivers not implementing
 * the functionality are assumed to not support backing files, hence all their
 * sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }

    if (!(ret & BDRV_BLOCK_DATA)) {
        if (bdrv_has_zero_init(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}
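
/* Minimal usage sketch (editor's illustration, not part of the original
 * source): walk an image in chunks and classify each extent by its status
 * flags; 'total_sectors' is an assumed caller-provided value.
 *
 *     int64_t sector = 0;
 *     while (sector < total_sectors) {
 *         int pnum;
 *         int64_t ret = bdrv_get_block_status(bs, sector, 1024, &pnum);
 *         if (ret < 0) {
 *             break;               // negative errno
 *         }
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             // this extent reads as zeroes
 *         } else if (ret & BDRV_BLOCK_DATA) {
 *             // data is present in this layer
 *         }
 *         sector += pnum;          // pnum sectors share this status
 *     }
 */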

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
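
/* Editor's example (not in the original source): for a chain
 * base <- mid <- top, where a sector is written only in 'mid',
 * bdrv_is_allocated_above(top, base, sector, 1, &pnum) returns 1 because the
 * scan starting at 'top' finds the sector allocated in 'mid' before reaching
 * 'base'; bdrv_is_allocated_above(top, mid, ...) returns 0, since only 'top'
 * itself is inspected.
 */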
3296

    
3297
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3298
{
3299
    if (bs->backing_hd && bs->backing_hd->encrypted)
3300
        return bs->backing_file;
3301
    else if (bs->encrypted)
3302
        return bs->filename;
3303
    else
3304
        return NULL;
3305
}
3306

    
3307
void bdrv_get_backing_filename(BlockDriverState *bs,
3308
                               char *filename, int filename_size)
3309
{
3310
    pstrcpy(filename, filename_size, bs->backing_file);
3311
}
3312

    
3313
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3314
                          const uint8_t *buf, int nb_sectors)
3315
{
3316
    BlockDriver *drv = bs->drv;
3317
    if (!drv)
3318
        return -ENOMEDIUM;
3319
    if (!drv->bdrv_write_compressed)
3320
        return -ENOTSUP;
3321
    if (bdrv_check_request(bs, sector_num, nb_sectors))
3322
        return -EIO;
3323

    
3324
    assert(!bs->dirty_bitmap);
3325

    
3326
    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3327
}
3328

    
3329
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3330
{
3331
    BlockDriver *drv = bs->drv;
3332
    if (!drv)
3333
        return -ENOMEDIUM;
3334
    if (!drv->bdrv_get_info)
3335
        return -ENOTSUP;
3336
    memset(bdi, 0, sizeof(*bdi));
3337
    return drv->bdrv_get_info(bs, bdi);
3338
}
3339

    
3340
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3341
                      int64_t pos, int size)
3342
{
3343
    QEMUIOVector qiov;
3344
    struct iovec iov = {
3345
        .iov_base   = (void *) buf,
3346
        .iov_len    = size,
3347
    };
3348

    
3349
    qemu_iovec_init_external(&qiov, &iov, 1);
3350
    return bdrv_writev_vmstate(bs, &qiov, pos);
3351
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
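
/* Worked example (illustrative; all file names below are hypothetical):
 * with a chain opened as /vm/top.qcow2 -> /vm/mid.qcow2 -> /vm/base.qcow2,
 * where top.qcow2 records its backing file as the relative name
 * "mid.qcow2", calling bdrv_find_backing_image(top_bs, "mid.qcow2")
 * combines the relative name with top.qcow2's own path, canonicalizes both
 * sides with realpath() and returns the BlockDriverState for
 * /vm/mid.qcow2.  Protocol names such as "nbd://host/export" skip the path
 * manipulation and are compared verbatim.
 */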

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine
    // them; only exactly sequential or overlapping requests are merged, so
    // no gap ever needs to be filled with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
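
/* Worked example (illustrative): given sorted requests A = [sector 0,
 * 4 sectors) and B = [sector 2, 4 sectors), B starts at or before A's end
 * (2 <= 4), so the two merge: the first 2 sectors of A's qiov are kept,
 * B's qiov is appended in full, and the merged request covers sectors
 * [0, 6).  A third request starting at sector 7 would leave a gap
 * (7 > 6) and therefore stay separate.
 */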

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case, this function returns -1 and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
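
/* Usage sketch (illustrative, not part of the original file): a caller such
 * as virtio-blk batches one BlockRequest per guest write and submits them
 * all at once.  MAX_REQS and the arrays below are hypothetical names:
 *
 *     BlockRequest reqs[MAX_REQS];
 *     for (i = 0; i < n; i++) {
 *         reqs[i].sector     = start_sector[i];
 *         reqs[i].nb_sectors = sectors[i];
 *         reqs[i].qiov       = qiovs[i];
 *         reqs[i].cb         = cbs[i];
 *         reqs[i].opaque     = opaques[i];
 *     }
 *     if (bdrv_aio_multiwrite(bs, reqs, n) < 0) {
 *         ...reqs[i].error tells which requests will get no callback...
 *     }
 */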

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
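
/* Illustrative note (not part of the original file): an AIO emulation
 * defines an AIOCBInfo carrying the size of its private ACB struct plus a
 * cancel callback, allocates with qemu_aio_get() and frees with
 * qemu_aio_release() once the completion callback has run.  MyAIOCB,
 * my_aiocb_info and my_cancel_fn below are hypothetical:
 *
 *     typedef struct MyAIOCB {
 *         BlockDriverAIOCB common;   // must be the first field
 *         int my_state;
 *     } MyAIOCB;
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *         .cancel     = my_cancel_fn,
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 *     ...
 *     qemu_aio_release(acb);
 */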

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
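
/* Illustrative flow (not part of the original file): bdrv_co_io_em() bridges
 * a callback-based driver into coroutine context.  The coroutine submits
 * the request with itself recorded in a CoroutineIOCompletion, yields, and
 * the AIO completion callback re-enters it with the result:
 *
 *     coroutine                          event loop
 *     ---------                          ----------
 *     bdrv_aio_readv(..., &co)
 *     qemu_coroutine_yield()   ------>   I/O completes
 *                                        bdrv_co_io_em_complete(&co, ret)
 *     return co.ret            <------   qemu_coroutine_enter(co.coroutine)
 */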

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
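
/* Summary (illustrative): a successful bdrv_co_flush() pushes data out in
 * three stages: format-driver caches to the OS (bdrv_co_flush_to_os);
 * then, unless BDRV_O_NO_FLUSH is set, OS caches to stable storage
 * (bdrv_co_flush_to_disk, or the bdrv_aio_flush fallback bridged through
 * CoroutineIOCompletion); and finally the same cascade recursively on
 * bs->file, the underlying protocol layer.
 */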

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
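
/* Pattern note (illustrative): bdrv_flush() is the synchronous wrapper
 * around the coroutine implementation.  When already inside a coroutine it
 * calls the entry function directly; otherwise it spawns a coroutine and
 * spins on qemu_aio_wait() until RwCo.ret leaves NOT_DONE.  bdrv_discard()
 * below follows exactly the same pattern.
 */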

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        granularity >>= BDRV_SECTOR_BITS;
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
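
/* Worked example (illustrative): with granularity = 65536 bytes, the shift
 * by BDRV_SECTOR_BITS (9) yields 128 sectors per dirty bit, and
 * ffs(128) - 1 = 7 becomes the hbitmap granularity, i.e. each bit in the
 * bitmap tracks a 64 KB run of the image.
 */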

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (bs->dirty_bitmap) {
        return hbitmap_get(bs->dirty_bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (bs->dirty_bitmap) {
        return hbitmap_count(bs->dirty_bitmap);
    } else {
        return 0;
    }
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If, after releasing, the reference count is zero, the BlockDriverState
 * is deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
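
/* Usage sketch (illustrative, not part of the original file): device models
 * bracket each guest I/O with an accounting cookie:
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, qiov->size, BDRV_ACCT_WRITE);
 *     ...issue the request; then, in its completion callback:...
 *     bdrv_acct_done(bs, &cookie);
 *
 * This feeds the per-type byte, operation and latency counters reported
 * through "query-blockstats".
 */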

void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_unref(bs);
    }
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
}
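
/* Usage sketch (illustrative): qemu-img's "create" command is a typical
 * caller, e.g. creating a 10 GiB qcow2 image backed by base.qcow2.  The
 * file names are hypothetical and OPEN_FLAGS stands in for whatever open
 * flags the caller uses:
 *
 *     Error *err = NULL;
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.qcow2", NULL,
 *                     NULL, 10 * 1024 * 1024 * 1024ULL,
 *                     OPEN_FLAGS, &err, false);
 *     if (err) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 */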

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}