Statistics
| Branch: | Revision:

root / block.c @ 34b5d2c6

History | View | Annotate | Download (126.6 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "qmp-commands.h"
36
#include "qemu/timer.h"
37

    
38
#ifdef CONFIG_BSD
39
#include <sys/types.h>
40
#include <sys/stat.h>
41
#include <sys/ioctl.h>
42
#include <sys/queue.h>
43
#ifndef __DragonFly__
44
#include <sys/disk.h>
45
#endif
46
#endif
47

    
48
#ifdef _WIN32
49
#include <windows.h>
50
#endif
51

    
52
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
53

    
54
/* Bit flags accepted by the internal coroutine read/write helpers. */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
58

    
59
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
60
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62
        BlockDriverCompletionFunc *cb, void *opaque);
63
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65
        BlockDriverCompletionFunc *cb, void *opaque);
66
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67
                                         int64_t sector_num, int nb_sectors,
68
                                         QEMUIOVector *iov);
69
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70
                                         int64_t sector_num, int nb_sectors,
71
                                         QEMUIOVector *iov);
72
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
73
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74
    BdrvRequestFlags flags);
75
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
76
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77
    BdrvRequestFlags flags);
78
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79
                                               int64_t sector_num,
80
                                               QEMUIOVector *qiov,
81
                                               int nb_sectors,
82
                                               BlockDriverCompletionFunc *cb,
83
                                               void *opaque,
84
                                               bool is_write);
85
static void coroutine_fn bdrv_co_do_rw(void *opaque);
86
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
87
    int64_t sector_num, int nb_sectors);
88

    
89
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
90
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
91

    
92
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
93
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
94

    
95
/* If non-zero, use only whitelisted block drivers */
96
static int use_bdrv_whitelist;
97

    
98
#ifdef _WIN32
99
/* Return 1 if @filename starts with an ASCII letter followed by ':'. */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    int is_lower = letter >= 'a' && letter <= 'z';
    int is_upper = letter >= 'A' && letter <= 'Z';

    if (!is_lower && !is_upper) {
        return 0;
    }
    return filename[1] == ':';
}
105

    
106
/*
 * Return 1 if @filename is exactly "<letter>:" or starts with the
 * "\\.\" (or "//./") prefix, and 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    int bare_drive = is_windows_drive_prefix(filename) && filename[2] == '\0';

    if (bare_drive) {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    return strstart(filename, "//./", NULL) ? 1 : 0;
}
116
#endif
117

    
118
/* throttling disk I/O limits */
119
void bdrv_set_io_limits(BlockDriverState *bs,
120
                        ThrottleConfig *cfg)
121
{
122
    int i;
123

    
124
    throttle_config(&bs->throttle_state, cfg);
125

    
126
    for (i = 0; i < 2; i++) {
127
        qemu_co_enter_next(&bs->throttled_reqs[i]);
128
    }
129
}
130

    
131
/* this function drain all the throttled IOs */
132
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
133
{
134
    bool drained = false;
135
    bool enabled = bs->io_limits_enabled;
136
    int i;
137

    
138
    bs->io_limits_enabled = false;
139

    
140
    for (i = 0; i < 2; i++) {
141
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
142
            drained = true;
143
        }
144
    }
145

    
146
    bs->io_limits_enabled = enabled;
147

    
148
    return drained;
149
}
150

    
151
/*
 * Turn I/O throttling off for @bs: clear the enabled flag, restart every
 * request still waiting in the throttled queues, then tear down the
 * throttle state.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    /* whether anything was actually drained is irrelevant here */
    (void)bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}
159

    
160
static void bdrv_throttle_read_timer_cb(void *opaque)
161
{
162
    BlockDriverState *bs = opaque;
163
    qemu_co_enter_next(&bs->throttled_reqs[0]);
164
}
165

    
166
static void bdrv_throttle_write_timer_cb(void *opaque)
167
{
168
    BlockDriverState *bs = opaque;
169
    qemu_co_enter_next(&bs->throttled_reqs[1]);
170
}
171

    
172
/* should be called before bdrv_set_io_limits if a limit is set */
173
void bdrv_io_limits_enable(BlockDriverState *bs)
174
{
175
    assert(!bs->io_limits_enabled);
176
    throttle_init(&bs->throttle_state,
177
                  QEMU_CLOCK_VIRTUAL,
178
                  bdrv_throttle_read_timer_cb,
179
                  bdrv_throttle_write_timer_cb,
180
                  bs);
181
    bs->io_limits_enabled = true;
182
}
183

    
184
/* This function makes an IO wait if needed
185
 *
186
 * @nb_sectors: the number of sectors of the IO
187
 * @is_write:   is the IO a write
188
 */
189
static void bdrv_io_limits_intercept(BlockDriverState *bs,
190
                                     int nb_sectors,
191
                                     bool is_write)
192
{
193
    /* does this io must wait */
194
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
195

    
196
    /* if must wait or any request of this type throttled queue the IO */
197
    if (must_wait ||
198
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
199
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
200
    }
201

    
202
    /* the IO will be executed, do the accounting */
203
    throttle_account(&bs->throttle_state,
204
                     is_write,
205
                     nb_sectors * BDRV_SECTOR_SIZE);
206

    
207
    /* if the next request must wait -> do nothing */
208
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
209
        return;
210
    }
211

    
212
    /* else queue next request for execution */
213
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
214
}
215

    
216
/*
 * Return non-zero if @path starts with "<protocol>:", i.e. a ':' occurs
 * before any path separator.  On Windows, drive names and drive prefixes
 * are never treated as protocols.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char delimiters[] = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char delimiters[] = ":/";
#endif

    /* the first delimiter found decides: ':' means a protocol prefix */
    return path[strcspn(path, delimiters)] == ':';
}
233

    
234
/*
 * Return 1 if @path is absolute, 0 otherwise.  On Windows, drive names,
 * drive-letter prefixes and a leading backslash also count as absolute.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
246

    
247
/*
 * Build in @dest (at most @dest_size bytes, always NUL-terminated) the path
 * of @filename interpreted relative to @base_path.
 *
 * An absolute @filename is copied unchanged.  Otherwise the prefix of
 * @base_path up to and including its last path separator (or its first ':'
 * if that comes later, which keeps URL-like prefixes) is prepended to
 * @filename.  Nothing is written when @dest_size is not positive.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' of the base path, if any */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last path separator of the base path, if any */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever prefix is longer, clamped to the destination size */
    prefix_end = after_sep > after_colon ? after_sep : after_colon;
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }

    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
290

    
291
/*
 * Store in @dest (size @sz) the full name of the backing file of @bs.
 *
 * An empty backing file name, or one that carries a protocol prefix, is
 * copied as is; anything else is resolved relative to the file name of @bs.
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;

    if (backing[0] != '\0' && !path_has_protocol(backing)) {
        path_combine(dest, sz, bs->filename, backing);
        return;
    }

    pstrcpy(dest, sz, backing);
}
299

    
300
/*
 * Register the block driver @bdrv.
 *
 * A driver without a coroutine read function gets the coroutine emulation
 * installed for both read and write.  If it also lacks an AIO read
 * function, the AIO emulation is installed as well.
 */
void bdrv_register(BlockDriver *bdrv)
{
    const int needs_co_emulation = !bdrv->bdrv_co_readv;

    if (needs_co_emulation) {
        /*
         * bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so
         * a driver that lacks aio needs that layer emulated too.
         */
        if (!bdrv->bdrv_aio_readv) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }

        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
319

    
320
/* create a new block device (by default it is empty) */
321
BlockDriverState *bdrv_new(const char *device_name)
322
{
323
    BlockDriverState *bs;
324

    
325
    bs = g_malloc0(sizeof(BlockDriverState));
326
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
327
    if (device_name[0] != '\0') {
328
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
329
    }
330
    bdrv_iostatus_disable(bs);
331
    notifier_list_init(&bs->close_notifiers);
332
    notifier_with_return_list_init(&bs->before_write_notifiers);
333
    qemu_co_queue_init(&bs->throttled_reqs[0]);
334
    qemu_co_queue_init(&bs->throttled_reqs[1]);
335
    bs->refcnt = 1;
336

    
337
    return bs;
338
}
339

    
340
/* Add @notify to the list of close notifiers of @bs. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&(bs->close_notifiers), notify);
}
344

    
345
BlockDriver *bdrv_find_format(const char *format_name)
346
{
347
    BlockDriver *drv1;
348
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
349
        if (!strcmp(drv1->format_name, format_name)) {
350
            return drv1;
351
        }
352
    }
353
    return NULL;
354
}
355

    
356
/*
 * Return 1 if @drv may be used, 0 otherwise.
 *
 * With both build-time whitelists empty, every driver is allowed.
 * Otherwise the driver must appear in the read-write whitelist, or, when
 * @read_only is set, in the read-only whitelist.
 */
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    int i;

    if (whitelist_rw[0] == NULL && whitelist_ro[0] == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    for (i = 0; whitelist_rw[i] != NULL; i++) {
        if (strcmp(drv->format_name, whitelist_rw[i]) == 0) {
            return 1;
        }
    }

    if (!read_only) {
        return 0;
    }

    for (i = 0; whitelist_ro[i] != NULL; i++) {
        if (strcmp(drv->format_name, whitelist_ro[i]) == 0) {
            return 1;
        }
    }

    return 0;
}
384

    
385
/*
 * Like bdrv_find_format(), but return NULL as well when the driver found
 * is not whitelisted for the requested access mode.
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (drv == NULL) {
        return NULL;
    }
    if (!bdrv_is_whitelisted(drv, read_only)) {
        return NULL;
    }
    return drv;
}
391

    
392
typedef struct CreateCo {
393
    BlockDriver *drv;
394
    char *filename;
395
    QEMUOptionParameter *options;
396
    int ret;
397
} CreateCo;
398

    
399
/*
 * Coroutine entry point for bdrv_create(): call the driver's bdrv_create
 * callback with the arguments stored in the CreateCo passed as @opaque and
 * store the result in its 'ret' field.
 */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *request = opaque;
    BlockDriver *driver = request->drv;

    assert(driver);

    request->ret = driver->bdrv_create(request->filename, request->options,
                                       NULL);
}
406

    
407
int bdrv_create(BlockDriver *drv, const char* filename,
408
    QEMUOptionParameter *options)
409
{
410
    int ret;
411

    
412
    Coroutine *co;
413
    CreateCo cco = {
414
        .drv = drv,
415
        .filename = g_strdup(filename),
416
        .options = options,
417
        .ret = NOT_DONE,
418
    };
419

    
420
    if (!drv->bdrv_create) {
421
        ret = -ENOTSUP;
422
        goto out;
423
    }
424

    
425
    if (qemu_in_coroutine()) {
426
        /* Fast-path if already in coroutine context */
427
        bdrv_create_co_entry(&cco);
428
    } else {
429
        co = qemu_coroutine_create(bdrv_create_co_entry);
430
        qemu_coroutine_enter(co, &cco);
431
        while (cco.ret == NOT_DONE) {
432
            qemu_aio_wait();
433
        }
434
    }
435

    
436
    ret = cco.ret;
437

    
438
out:
439
    g_free(cco.filename);
440
    return ret;
441
}
442

    
443
/*
 * Create @filename using the protocol driver selected from the file name.
 * Returns -ENOENT if no protocol driver is found, otherwise the result of
 * bdrv_create().
 */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *proto = bdrv_find_protocol(filename, true);

    return proto ? bdrv_create(proto, filename, options) : -ENOENT;
}
454

    
455
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the name of the created file
 * @size:     size of @filename in bytes (must be at least MAX_PATH on
 *            Windows; this is asserted)
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows, the
 * negated GetLastError() code).  On POSIX hosts the file is created in
 * $TMPDIR, or in /tmp when TMPDIR is unset; -EOVERFLOW is returned if the
 * name template does not fit into @filename.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /*
         * Remember why close() failed before cleaning up: unlink() may
         * itself fail and overwrite errno.
         */
        int saved_errno = errno;

        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
489

    
490
/*
491
 * Detect host devices. By convention, /dev/cdrom[N] is always
492
 * recognized as a host CDROM.
493
 */
494
static BlockDriver *find_hdev_driver(const char *filename)
495
{
496
    int score_max = 0, score;
497
    BlockDriver *drv = NULL, *d;
498

    
499
    QLIST_FOREACH(d, &bdrv_drivers, list) {
500
        if (d->bdrv_probe_device) {
501
            score = d->bdrv_probe_device(filename);
502
            if (score > score_max) {
503
                score_max = score;
504
                drv = d;
505
            }
506
        }
507
    }
508

    
509
    return drv;
510
}
511

    
512
BlockDriver *bdrv_find_protocol(const char *filename,
513
                                bool allow_protocol_prefix)
514
{
515
    BlockDriver *drv1;
516
    char protocol[128];
517
    int len;
518
    const char *p;
519

    
520
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
521

    
522
    /*
523
     * XXX(hch): we really should not let host device detection
524
     * override an explicit protocol specification, but moving this
525
     * later breaks access to device names with colons in them.
526
     * Thanks to the brain-dead persistent naming schemes on udev-
527
     * based Linux systems those actually are quite common.
528
     */
529
    drv1 = find_hdev_driver(filename);
530
    if (drv1) {
531
        return drv1;
532
    }
533

    
534
    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
535
        return bdrv_find_format("file");
536
    }
537

    
538
    p = strchr(filename, ':');
539
    assert(p != NULL);
540
    len = p - filename;
541
    if (len > sizeof(protocol) - 1)
542
        len = sizeof(protocol) - 1;
543
    memcpy(protocol, filename, len);
544
    protocol[len] = '\0';
545
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
546
        if (drv1->protocol_name &&
547
            !strcmp(drv1->protocol_name, protocol)) {
548
            return drv1;
549
        }
550
    }
551
    return NULL;
552
}
553

    
554
static int find_image_format(BlockDriverState *bs, const char *filename,
555
                             BlockDriver **pdrv, Error **errp)
556
{
557
    int score, score_max;
558
    BlockDriver *drv1, *drv;
559
    uint8_t buf[2048];
560
    int ret = 0;
561

    
562
    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
563
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
564
        drv = bdrv_find_format("raw");
565
        if (!drv) {
566
            error_setg(errp, "Could not find raw image format");
567
            ret = -ENOENT;
568
        }
569
        *pdrv = drv;
570
        return ret;
571
    }
572

    
573
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
574
    if (ret < 0) {
575
        error_setg_errno(errp, -ret, "Could not read image for determining its "
576
                         "format");
577
        *pdrv = NULL;
578
        return ret;
579
    }
580

    
581
    score_max = 0;
582
    drv = NULL;
583
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
584
        if (drv1->bdrv_probe) {
585
            score = drv1->bdrv_probe(buf, ret, filename);
586
            if (score > score_max) {
587
                score_max = score;
588
                drv = drv1;
589
            }
590
        }
591
    }
592
    if (!drv) {
593
        error_setg(errp, "Could not determine image format: No compatible "
594
                   "driver found");
595
        ret = -ENOENT;
596
    }
597
    *pdrv = drv;
598
    return ret;
599
}
600

    
601
/**
602
 * Set the current 'total_sectors' value
603
 */
604
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
605
{
606
    BlockDriver *drv = bs->drv;
607

    
608
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
609
    if (bs->sg)
610
        return 0;
611

    
612
    /* query actual device if possible, otherwise just trust the hint */
613
    if (drv->bdrv_getlength) {
614
        int64_t length = drv->bdrv_getlength(bs);
615
        if (length < 0) {
616
            return length;
617
        }
618
        hint = length >> BDRV_SECTOR_BITS;
619
    }
620

    
621
    bs->total_sectors = hint;
622
    return 0;
623
}
624

    
625
/**
626
 * Set open flags for a given discard mode
627
 *
628
 * Return 0 on success, -1 if the discard mode was invalid.
629
 */
630
int bdrv_parse_discard_flags(const char *mode, int *flags)
631
{
632
    *flags &= ~BDRV_O_UNMAP;
633

    
634
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
635
        /* do nothing */
636
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
637
        *flags |= BDRV_O_UNMAP;
638
    } else {
639
        return -1;
640
    }
641

    
642
    return 0;
643
}
644

    
645
/**
646
 * Set open flags for a given cache mode
647
 *
648
 * Return 0 on success, -1 if the cache mode was invalid.
649
 */
650
int bdrv_parse_cache_flags(const char *mode, int *flags)
651
{
652
    *flags &= ~BDRV_O_CACHE_MASK;
653

    
654
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
655
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
656
    } else if (!strcmp(mode, "directsync")) {
657
        *flags |= BDRV_O_NOCACHE;
658
    } else if (!strcmp(mode, "writeback")) {
659
        *flags |= BDRV_O_CACHE_WB;
660
    } else if (!strcmp(mode, "unsafe")) {
661
        *flags |= BDRV_O_CACHE_WB;
662
        *flags |= BDRV_O_NO_FLUSH;
663
    } else if (!strcmp(mode, "writethrough")) {
664
        /* this is the default */
665
    } else {
666
        return -1;
667
    }
668

    
669
    return 0;
670
}
671

    
672
/**
673
 * The copy-on-read flag is actually a reference count so multiple users may
674
 * use the feature without worrying about clobbering its previous state.
675
 * Copy-on-read stays enabled until all users have called to disable it.
676
 */
677
void bdrv_enable_copy_on_read(BlockDriverState *bs)
678
{
679
    bs->copy_on_read++;
680
}
681

    
682
/*
 * Drop one copy-on-read reference taken with bdrv_enable_copy_on_read().
 * The count must be positive; this is asserted.
 */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
687

    
688
/*
 * Compute the flags passed to the driver's open function from the flags
 * requested by the caller.
 */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    /* Flags that are internal to the block layer and never reach drivers. */
    const int internal_only = BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING;
    int open_flags = (flags | BDRV_O_CACHE_WB) & ~internal_only;

    /* Snapshots should be writable. */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
707

    
708
/*
709
 * Common part for opening disk images and files
710
 *
711
 * Removes all processed options from *options.
712
 */
713
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
714
    QDict *options, int flags, BlockDriver *drv, Error **errp)
715
{
716
    int ret, open_flags;
717
    const char *filename;
718
    Error *local_err = NULL;
719

    
720
    assert(drv != NULL);
721
    assert(bs->file == NULL);
722
    assert(options != NULL && bs->options != options);
723

    
724
    if (file != NULL) {
725
        filename = file->filename;
726
    } else {
727
        filename = qdict_get_try_str(options, "filename");
728
    }
729

    
730
    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
731

    
732
    /* bdrv_open() with directly using a protocol as drv. This layer is already
733
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
734
     * and return immediately. */
735
    if (file != NULL && drv->bdrv_file_open) {
736
        bdrv_swap(file, bs);
737
        return 0;
738
    }
739

    
740
    bs->open_flags = flags;
741
    bs->buffer_alignment = 512;
742
    bs->zero_beyond_eof = true;
743
    open_flags = bdrv_open_flags(bs, flags);
744
    bs->read_only = !(open_flags & BDRV_O_RDWR);
745

    
746
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
747
        error_setg(errp, "Driver '%s' is not whitelisted", drv->format_name);
748
        return -ENOTSUP;
749
    }
750

    
751
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
752
    if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
753
        bdrv_enable_copy_on_read(bs);
754
    }
755

    
756
    if (filename != NULL) {
757
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
758
    } else {
759
        bs->filename[0] = '\0';
760
    }
761

    
762
    bs->drv = drv;
763
    bs->opaque = g_malloc0(drv->instance_size);
764

    
765
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
766

    
767
    /* Open the image, either directly or using a protocol */
768
    if (drv->bdrv_file_open) {
769
        assert(file == NULL);
770
        assert(drv->bdrv_parse_filename || filename != NULL);
771
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
772
    } else {
773
        if (file == NULL) {
774
            error_setg(errp, "Can't use '%s' as a block driver for the "
775
                       "protocol level", drv->format_name);
776
            ret = -EINVAL;
777
            goto free_and_fail;
778
        }
779
        bs->file = file;
780
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
781
    }
782

    
783
    if (ret < 0) {
784
        if (error_is_set(&local_err)) {
785
            error_propagate(errp, local_err);
786
        } else if (filename) {
787
            error_setg_errno(errp, -ret, "Could not open '%s'", filename);
788
        } else {
789
            error_setg_errno(errp, -ret, "Could not open image");
790
        }
791
        goto free_and_fail;
792
    }
793

    
794
    ret = refresh_total_sectors(bs, bs->total_sectors);
795
    if (ret < 0) {
796
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
797
        goto free_and_fail;
798
    }
799

    
800
#ifndef _WIN32
801
    if (bs->is_temporary) {
802
        assert(filename != NULL);
803
        unlink(filename);
804
    }
805
#endif
806
    return 0;
807

    
808
free_and_fail:
809
    bs->file = NULL;
810
    g_free(bs->opaque);
811
    bs->opaque = NULL;
812
    bs->drv = NULL;
813
    return ret;
814
}
815

    
816
/*
817
 * Opens a file using a protocol (file, host_device, nbd, ...)
818
 *
819
 * options is a QDict of options to pass to the block drivers, or NULL for an
820
 * empty set of options. The reference to the QDict belongs to the block layer
821
 * after the call (even on failure), so if the caller intends to reuse the
822
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
823
 */
824
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
825
                   QDict *options, int flags, Error **errp)
826
{
827
    BlockDriverState *bs;
828
    BlockDriver *drv;
829
    const char *drvname;
830
    bool allow_protocol_prefix = false;
831
    Error *local_err = NULL;
832
    int ret;
833

    
834
    /* NULL means an empty set of options */
835
    if (options == NULL) {
836
        options = qdict_new();
837
    }
838

    
839
    bs = bdrv_new("");
840
    bs->options = options;
841
    options = qdict_clone_shallow(options);
842

    
843
    /* Fetch the file name from the options QDict if necessary */
844
    if (!filename) {
845
        filename = qdict_get_try_str(options, "filename");
846
    } else if (filename && !qdict_haskey(options, "filename")) {
847
        qdict_put(options, "filename", qstring_from_str(filename));
848
        allow_protocol_prefix = true;
849
    } else {
850
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
851
                   "same time");
852
        ret = -EINVAL;
853
        goto fail;
854
    }
855

    
856
    /* Find the right block driver */
857
    drvname = qdict_get_try_str(options, "driver");
858
    if (drvname) {
859
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
860
        if (!drv) {
861
            error_setg(errp, "Unknown driver '%s'", drvname);
862
        }
863
        qdict_del(options, "driver");
864
    } else if (filename) {
865
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
866
        if (!drv) {
867
            error_setg(errp, "Unknown protocol");
868
        }
869
    } else {
870
        error_setg(errp, "Must specify either driver or file");
871
        drv = NULL;
872
    }
873

    
874
    if (!drv) {
875
        /* errp has been set already */
876
        ret = -ENOENT;
877
        goto fail;
878
    }
879

    
880
    /* Parse the filename and open it */
881
    if (drv->bdrv_parse_filename && filename) {
882
        drv->bdrv_parse_filename(filename, options, &local_err);
883
        if (error_is_set(&local_err)) {
884
            error_propagate(errp, local_err);
885
            ret = -EINVAL;
886
            goto fail;
887
        }
888
        qdict_del(options, "filename");
889
    } else if (!drv->bdrv_parse_filename && !filename) {
890
        error_setg(errp, "The '%s' block driver requires a file name",
891
                   drv->format_name);
892
        ret = -EINVAL;
893
        goto fail;
894
    }
895

    
896
    ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
897
    if (ret < 0) {
898
        error_propagate(errp, local_err);
899
        goto fail;
900
    }
901

    
902
    /* Check if any unknown options were used */
903
    if (qdict_size(options) != 0) {
904
        const QDictEntry *entry = qdict_first(options);
905
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
906
                   drv->format_name, entry->key);
907
        ret = -EINVAL;
908
        goto fail;
909
    }
910
    QDECREF(options);
911

    
912
    bs->growable = 1;
913
    *pbs = bs;
914
    return 0;
915

    
916
fail:
917
    QDECREF(options);
918
    if (!bs->drv) {
919
        QDECREF(bs->options);
920
    }
921
    bdrv_unref(bs);
922
    return ret;
923
}
924

    
925
/*
926
 * Opens the backing file for a BlockDriverState if not yet open
927
 *
928
 * options is a QDict of options to pass to the block drivers, or NULL for an
929
 * empty set of options. The reference to the QDict is transferred to this
930
 * function (even on failure), so if the caller intends to reuse the dictionary,
931
 * it needs to use QINCREF() before calling bdrv_file_open.
932
 */
933
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
934
{
935
    char backing_filename[PATH_MAX];
936
    int back_flags, ret;
937
    BlockDriver *back_drv = NULL;
938
    Error *local_err = NULL;
939

    
940
    if (bs->backing_hd != NULL) {
941
        QDECREF(options);
942
        return 0;
943
    }
944

    
945
    /* NULL means an empty set of options */
946
    if (options == NULL) {
947
        options = qdict_new();
948
    }
949

    
950
    bs->open_flags &= ~BDRV_O_NO_BACKING;
951
    if (qdict_haskey(options, "file.filename")) {
952
        backing_filename[0] = '\0';
953
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
954
        QDECREF(options);
955
        return 0;
956
    }
957

    
958
    bs->backing_hd = bdrv_new("");
959
    bdrv_get_full_backing_filename(bs, backing_filename,
960
                                   sizeof(backing_filename));
961

    
962
    if (bs->backing_format[0] != '\0') {
963
        back_drv = bdrv_find_format(bs->backing_format);
964
    }
965

    
966
    /* backing files always opened read-only */
967
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);
968

    
969
    ret = bdrv_open(bs->backing_hd,
970
                    *backing_filename ? backing_filename : NULL, options,
971
                    back_flags, back_drv, &local_err);
972
    if (ret < 0) {
973
        bdrv_unref(bs->backing_hd);
974
        bs->backing_hd = NULL;
975
        bs->open_flags |= BDRV_O_NO_BACKING;
976
        error_propagate(errp, local_err);
977
        return ret;
978
    }
979
    return 0;
980
}
981

    
982
/*
 * Creates a new QDict in *dst and moves into it every entry of src for which
 * strstart() matches the key against 'start'. The moved entry is stored under
 * the remainder of the key that strstart() reports, and is deleted from src.
 */
static void extract_subqdict(QDict *src, QDict **dst, const char *start)
{
    const QDictEntry *cur, *following;
    const char *remainder;

    *dst = qdict_new();

    for (cur = qdict_first(src); cur != NULL; cur = following) {
        /* Look up the successor first: cur may be deleted from src below */
        following = qdict_next(src, cur);

        if (!strstart(cur->key, start, &remainder)) {
            continue;
        }

        /* Take an extra reference before the value is stored in *dst */
        qobject_incref(cur->value);
        qdict_put_obj(*dst, remainder, cur->value);
        qdict_del(src, cur->key);
    }
}
1000

    
1001
/*
1002
 * Opens a disk image (raw, qcow2, vmdk, ...)
1003
 *
1004
 * options is a QDict of options to pass to the block drivers, or NULL for an
1005
 * empty set of options. The reference to the QDict belongs to the block layer
1006
 * after the call (even on failure), so if the caller intends to reuse the
1007
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1008
 */
1009
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
1010
              int flags, BlockDriver *drv, Error **errp)
1011
{
1012
    int ret;
1013
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1014
    char tmp_filename[PATH_MAX + 1];
1015
    BlockDriverState *file = NULL;
1016
    QDict *file_options = NULL;
1017
    const char *drvname;
1018
    Error *local_err = NULL;
1019

    
1020
    /* NULL means an empty set of options */
1021
    if (options == NULL) {
1022
        options = qdict_new();
1023
    }
1024

    
1025
    bs->options = options;
1026
    options = qdict_clone_shallow(options);
1027

    
1028
    /* For snapshot=on, create a temporary qcow2 overlay */
1029
    if (flags & BDRV_O_SNAPSHOT) {
1030
        BlockDriverState *bs1;
1031
        int64_t total_size;
1032
        BlockDriver *bdrv_qcow2;
1033
        QEMUOptionParameter *create_options;
1034
        char backing_filename[PATH_MAX];
1035

    
1036
        if (qdict_size(options) != 0) {
1037
            error_setg(errp, "Can't use snapshot=on with driver-specific options");
1038
            ret = -EINVAL;
1039
            goto fail;
1040
        }
1041
        assert(filename != NULL);
1042

    
1043
        /* if snapshot, we create a temporary backing file and open it
1044
           instead of opening 'filename' directly */
1045

    
1046
        /* if there is a backing file, use it */
1047
        bs1 = bdrv_new("");
1048
        ret = bdrv_open(bs1, filename, NULL, 0, drv, &local_err);
1049
        if (ret < 0) {
1050
            bdrv_unref(bs1);
1051
            goto fail;
1052
        }
1053
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1054

    
1055
        bdrv_unref(bs1);
1056

    
1057
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1058
        if (ret < 0) {
1059
            error_setg_errno(errp, -ret, "Could not get temporary filename");
1060
            goto fail;
1061
        }
1062

    
1063
        /* Real path is meaningless for protocols */
1064
        if (path_has_protocol(filename)) {
1065
            snprintf(backing_filename, sizeof(backing_filename),
1066
                     "%s", filename);
1067
        } else if (!realpath(filename, backing_filename)) {
1068
            error_setg_errno(errp, errno, "Could not resolve path '%s'", filename);
1069
            ret = -errno;
1070
            goto fail;
1071
        }
1072

    
1073
        bdrv_qcow2 = bdrv_find_format("qcow2");
1074
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1075
                                                 NULL);
1076

    
1077
        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1078
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
1079
                             backing_filename);
1080
        if (drv) {
1081
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
1082
                drv->format_name);
1083
        }
1084

    
1085
        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
1086
        free_option_parameters(create_options);
1087
        if (ret < 0) {
1088
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
1089
                             "'%s'", tmp_filename);
1090
            goto fail;
1091
        }
1092

    
1093
        filename = tmp_filename;
1094
        drv = bdrv_qcow2;
1095
        bs->is_temporary = 1;
1096
    }
1097

    
1098
    /* Open image file without format layer */
1099
    if (flags & BDRV_O_RDWR) {
1100
        flags |= BDRV_O_ALLOW_RDWR;
1101
    }
1102

    
1103
    extract_subqdict(options, &file_options, "file.");
1104

    
1105
    ret = bdrv_file_open(&file, filename, file_options,
1106
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
1107
    if (ret < 0) {
1108
        goto fail;
1109
    }
1110

    
1111
    /* Find the right image format driver */
1112
    drvname = qdict_get_try_str(options, "driver");
1113
    if (drvname) {
1114
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
1115
        qdict_del(options, "driver");
1116
    }
1117

    
1118
    if (!drv) {
1119
        ret = find_image_format(file, filename, &drv, &local_err);
1120
    }
1121

    
1122
    if (!drv) {
1123
        goto unlink_and_fail;
1124
    }
1125

    
1126
    /* Open the image */
1127
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1128
    if (ret < 0) {
1129
        goto unlink_and_fail;
1130
    }
1131

    
1132
    if (bs->file != file) {
1133
        bdrv_unref(file);
1134
        file = NULL;
1135
    }
1136

    
1137
    /* If there is a backing file, use it */
1138
    if ((flags & BDRV_O_NO_BACKING) == 0) {
1139
        QDict *backing_options;
1140

    
1141
        extract_subqdict(options, &backing_options, "backing.");
1142
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1143
        if (ret < 0) {
1144
            goto close_and_fail;
1145
        }
1146
    }
1147

    
1148
    /* Check if any unknown options were used */
1149
    if (qdict_size(options) != 0) {
1150
        const QDictEntry *entry = qdict_first(options);
1151
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1152
                   "support the option '%s'", drv->format_name, bs->device_name,
1153
                   entry->key);
1154

    
1155
        ret = -EINVAL;
1156
        goto close_and_fail;
1157
    }
1158
    QDECREF(options);
1159

    
1160
    if (!bdrv_key_required(bs)) {
1161
        bdrv_dev_change_media_cb(bs, true);
1162
    }
1163

    
1164
    return 0;
1165

    
1166
unlink_and_fail:
1167
    if (file != NULL) {
1168
        bdrv_unref(file);
1169
    }
1170
    if (bs->is_temporary) {
1171
        unlink(filename);
1172
    }
1173
fail:
1174
    QDECREF(bs->options);
1175
    QDECREF(options);
1176
    bs->options = NULL;
1177
    if (error_is_set(&local_err)) {
1178
        error_propagate(errp, local_err);
1179
    }
1180
    return ret;
1181

    
1182
close_and_fail:
1183
    bdrv_close(bs);
1184
    QDECREF(options);
1185
    if (error_is_set(&local_err)) {
1186
        error_propagate(errp, local_err);
1187
    }
1188
    return ret;
1189
}
1190

    
1191
typedef struct BlockReopenQueueEntry {
1192
     bool prepared;
1193
     BDRVReopenState state;
1194
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1195
} BlockReopenQueueEntry;
1196

    
1197
/*
1198
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1199
 * reopen of multiple devices.
1200
 *
1201
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1202
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1203
 * be created and initialized. This newly created BlockReopenQueue should be
1204
 * passed back in for subsequent calls that are intended to be of the same
1205
 * atomic 'set'.
1206
 *
1207
 * bs is the BlockDriverState to add to the reopen queue.
1208
 *
1209
 * flags contains the open flags for the associated bs
1210
 *
1211
 * returns a pointer to bs_queue, which is either the newly allocated
1212
 * bs_queue, or the existing bs_queue being used.
1213
 *
1214
 */
1215
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1216
                                    BlockDriverState *bs, int flags)
1217
{
1218
    assert(bs != NULL);
1219

    
1220
    BlockReopenQueueEntry *bs_entry;
1221
    if (bs_queue == NULL) {
1222
        bs_queue = g_new0(BlockReopenQueue, 1);
1223
        QSIMPLEQ_INIT(bs_queue);
1224
    }
1225

    
1226
    if (bs->file) {
1227
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1228
    }
1229

    
1230
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1231
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1232

    
1233
    bs_entry->state.bs = bs;
1234
    bs_entry->state.flags = flags;
1235

    
1236
    return bs_queue;
1237
}
1238

    
1239
/*
1240
 * Reopen multiple BlockDriverStates atomically & transactionally.
1241
 *
1242
 * The queue passed in (bs_queue) must have been built up previous
1243
 * via bdrv_reopen_queue().
1244
 *
1245
 * Reopens all BDS specified in the queue, with the appropriate
1246
 * flags.  All devices are prepared for reopen, and failure of any
1247
 * device will cause all device changes to be abandonded, and intermediate
1248
 * data cleaned up.
1249
 *
1250
 * If all devices prepare successfully, then the changes are committed
1251
 * to all devices.
1252
 *
1253
 */
1254
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1255
{
1256
    int ret = -1;
1257
    BlockReopenQueueEntry *bs_entry, *next;
1258
    Error *local_err = NULL;
1259

    
1260
    assert(bs_queue != NULL);
1261

    
1262
    bdrv_drain_all();
1263

    
1264
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1265
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1266
            error_propagate(errp, local_err);
1267
            goto cleanup;
1268
        }
1269
        bs_entry->prepared = true;
1270
    }
1271

    
1272
    /* If we reach this point, we have success and just need to apply the
1273
     * changes
1274
     */
1275
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1276
        bdrv_reopen_commit(&bs_entry->state);
1277
    }
1278

    
1279
    ret = 0;
1280

    
1281
cleanup:
1282
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1283
        if (ret && bs_entry->prepared) {
1284
            bdrv_reopen_abort(&bs_entry->state);
1285
        }
1286
        g_free(bs_entry);
1287
    }
1288
    g_free(bs_queue);
1289
    return ret;
1290
}
1291

    
1292

    
1293
/* Reopen a single BlockDriverState with the specified flags. */
1294
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1295
{
1296
    int ret = -1;
1297
    Error *local_err = NULL;
1298
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1299

    
1300
    ret = bdrv_reopen_multiple(queue, &local_err);
1301
    if (local_err != NULL) {
1302
        error_propagate(errp, local_err);
1303
    }
1304
    return ret;
1305
}
1306

    
1307

    
1308
/*
1309
 * Prepares a BlockDriverState for reopen. All changes are staged in the
1310
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1311
 * the block driver layer .bdrv_reopen_prepare()
1312
 *
1313
 * bs is the BlockDriverState to reopen
1314
 * flags are the new open flags
1315
 * queue is the reopen queue
1316
 *
1317
 * Returns 0 on success, non-zero on error.  On error errp will be set
1318
 * as well.
1319
 *
1320
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1321
 * It is the responsibility of the caller to then call the abort() or
1322
 * commit() for any other BDS that have been left in a prepare() state
1323
 *
1324
 */
1325
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1326
                        Error **errp)
1327
{
1328
    int ret = -1;
1329
    Error *local_err = NULL;
1330
    BlockDriver *drv;
1331

    
1332
    assert(reopen_state != NULL);
1333
    assert(reopen_state->bs->drv != NULL);
1334
    drv = reopen_state->bs->drv;
1335

    
1336
    /* if we are to stay read-only, do not allow permission change
1337
     * to r/w */
1338
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1339
        reopen_state->flags & BDRV_O_RDWR) {
1340
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1341
                  reopen_state->bs->device_name);
1342
        goto error;
1343
    }
1344

    
1345

    
1346
    ret = bdrv_flush(reopen_state->bs);
1347
    if (ret) {
1348
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1349
                  strerror(-ret));
1350
        goto error;
1351
    }
1352

    
1353
    if (drv->bdrv_reopen_prepare) {
1354
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1355
        if (ret) {
1356
            if (local_err != NULL) {
1357
                error_propagate(errp, local_err);
1358
            } else {
1359
                error_setg(errp, "failed while preparing to reopen image '%s'",
1360
                           reopen_state->bs->filename);
1361
            }
1362
            goto error;
1363
        }
1364
    } else {
1365
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1366
         * handler for each supported drv. */
1367
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1368
                  drv->format_name, reopen_state->bs->device_name,
1369
                 "reopening of file");
1370
        ret = -1;
1371
        goto error;
1372
    }
1373

    
1374
    ret = 0;
1375

    
1376
error:
1377
    return ret;
1378
}
1379

    
1380
/*
1381
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1382
 * makes them final by swapping the staging BlockDriverState contents into
1383
 * the active BlockDriverState contents.
1384
 */
1385
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1386
{
1387
    BlockDriver *drv;
1388

    
1389
    assert(reopen_state != NULL);
1390
    drv = reopen_state->bs->drv;
1391
    assert(drv != NULL);
1392

    
1393
    /* If there are any driver level actions to take */
1394
    if (drv->bdrv_reopen_commit) {
1395
        drv->bdrv_reopen_commit(reopen_state);
1396
    }
1397

    
1398
    /* set BDS specific flags now */
1399
    reopen_state->bs->open_flags         = reopen_state->flags;
1400
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1401
                                              BDRV_O_CACHE_WB);
1402
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1403
}
1404

    
1405
/*
1406
 * Abort the reopen, and delete and free the staged changes in
1407
 * reopen_state
1408
 */
1409
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1410
{
1411
    BlockDriver *drv;
1412

    
1413
    assert(reopen_state != NULL);
1414
    drv = reopen_state->bs->drv;
1415
    assert(drv != NULL);
1416

    
1417
    if (drv->bdrv_reopen_abort) {
1418
        drv->bdrv_reopen_abort(reopen_state);
1419
    }
1420
}
1421

    
1422

    
1423
/*
 * Closes bs: cancels a running block job, drains and flushes I/O, notifies
 * the close notifiers and, if a driver is attached, releases the backing
 * file, lets the driver close the image and resets the per-image fields.
 * Afterwards the attached device is told that the medium is gone and I/O
 * throttling is switched off.
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }

    bdrv_drain_all();   /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all();   /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }

        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        /* Back to the "no image" state */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1472

    
1473
void bdrv_close_all(void)
1474
{
1475
    BlockDriverState *bs;
1476

    
1477
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1478
        bdrv_close(bs);
1479
    }
1480
}
1481

    
1482
/* Check if any requests are in-flight (including throttled requests) */
1483
static bool bdrv_requests_pending(BlockDriverState *bs)
1484
{
1485
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
1486
        return true;
1487
    }
1488
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1489
        return true;
1490
    }
1491
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1492
        return true;
1493
    }
1494
    if (bs->file && bdrv_requests_pending(bs->file)) {
1495
        return true;
1496
    }
1497
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1498
        return true;
1499
    }
1500
    return false;
1501
}
1502

    
1503
static bool bdrv_requests_pending_all(void)
1504
{
1505
    BlockDriverState *bs;
1506
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1507
        if (bdrv_requests_pending(bs)) {
1508
            return true;
1509
        }
1510
    }
1511
    return false;
1512
}
1513

    
1514
/*
1515
 * Wait for pending requests to complete across all BlockDriverStates
1516
 *
1517
 * This function does not flush data to disk, use bdrv_flush_all() for that
1518
 * after calling this function.
1519
 *
1520
 * Note that completion of an asynchronous I/O operation can trigger any
1521
 * number of other I/O operations on other devices---for example a coroutine
1522
 * can be arbitrarily complex and a constant flow of I/O can come until the
1523
 * coroutine is complete.  Because of this, it is not possible to have a
1524
 * function to drain a single device's I/O queue.
1525
 */
1526
void bdrv_drain_all(void)
1527
{
1528
    /* Always run first iteration so any pending completion BHs run */
1529
    bool busy = true;
1530
    BlockDriverState *bs;
1531

    
1532
    while (busy) {
1533
        /* FIXME: We do not have timer support here, so this is effectively
1534
         * a busy wait.
1535
         */
1536
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
1537
            if (bdrv_start_throttled_reqs(bs)) {
1538
                busy = true;
1539
            }
1540
        }
1541

    
1542
        busy = bdrv_requests_pending_all();
1543
        busy |= aio_poll(qemu_get_aio_context(), busy);
1544
    }
1545
}
1546

    
1547
/* make a BlockDriverState anonymous by removing from bdrv_state list.
1548
   Also, NULL terminate the device_name to prevent double remove */
1549
void bdrv_make_anon(BlockDriverState *bs)
1550
{
1551
    if (bs->device_name[0] != '\0') {
1552
        QTAILQ_REMOVE(&bdrv_states, bs, list);
1553
    }
1554
    bs->device_name[0] = '\0';
1555
}
1556

    
1557
/* Invokes the driver's .bdrv_rebind callback for bs, if there is one */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (!bs->drv || !bs->drv->bdrv_rebind) {
        return;
    }
    bs->drv->bdrv_rebind(bs);
}
1563

    
1564
/*
 * Copies from bs_src to bs_dest the fields that need to stay attached to
 * the device rather than to the image contents.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    int i;

    bs_dest->open_flags = bs_src->open_flags;

    /* attached device */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* I/O throttling state and queued throttled requests */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    for (i = 0; i < 2; i++) {
        bs_dest->throttled_reqs[i] = bs_src->throttled_reqs[i];
    }
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap = bs_src->dirty_bitmap;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* block job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}
1610

    
1611
/*
1612
 * Swap bs contents for two image chains while they are live,
1613
 * while keeping required fields on the BlockDriverState that is
1614
 * actually attached to a device.
1615
 *
1616
 * This will modify the BlockDriverState fields, and swap contents
1617
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1618
 *
1619
 * bs_new is required to be anonymous.
1620
 *
1621
 * This function does not create any image files.
1622
 */
1623
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1624
{
1625
    BlockDriverState tmp;
1626

    
1627
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1628
    assert(bs_new->device_name[0] == '\0');
1629
    assert(bs_new->dirty_bitmap == NULL);
1630
    assert(bs_new->job == NULL);
1631
    assert(bs_new->dev == NULL);
1632
    assert(bs_new->in_use == 0);
1633
    assert(bs_new->io_limits_enabled == false);
1634
    assert(!throttle_have_timer(&bs_new->throttle_state));
1635

    
1636
    tmp = *bs_new;
1637
    *bs_new = *bs_old;
1638
    *bs_old = tmp;
1639

    
1640
    /* there are some fields that should not be swapped, move them back */
1641
    bdrv_move_feature_fields(&tmp, bs_old);
1642
    bdrv_move_feature_fields(bs_old, bs_new);
1643
    bdrv_move_feature_fields(bs_new, &tmp);
1644

    
1645
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1646
    assert(bs_new->device_name[0] == '\0');
1647

    
1648
    /* Check a few fields that should remain attached to the device */
1649
    assert(bs_new->dev == NULL);
1650
    assert(bs_new->job == NULL);
1651
    assert(bs_new->in_use == 0);
1652
    assert(bs_new->io_limits_enabled == false);
1653
    assert(!throttle_have_timer(&bs_new->throttle_state));
1654

    
1655
    bdrv_rebind(bs_new);
1656
    bdrv_rebind(bs_old);
1657
}
1658

    
1659
/*
1660
 * Add new bs contents at the top of an image chain while the chain is
1661
 * live, while keeping required fields on the top layer.
1662
 *
1663
 * This will modify the BlockDriverState fields, and swap contents
1664
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1665
 *
1666
 * bs_new is required to be anonymous.
1667
 *
1668
 * This function does not create any image files.
1669
 */
1670
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1671
{
1672
    bdrv_swap(bs_new, bs_top);
1673

    
1674
    /* The contents of 'tmp' will become bs_top, as we are
1675
     * swapping bs_new and bs_top contents. */
1676
    bs_top->backing_hd = bs_new;
1677
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1678
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1679
            bs_new->filename);
1680
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1681
            bs_new->drv ? bs_new->drv->format_name : "");
1682
}
1683

    
1684
/*
 * Closes and frees bs. The BlockDriverState must no longer have a device
 * or job attached, must not be in use and must have a reference count of
 * zero.
 */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);

    bdrv_close(bs);

    /* Take bs out of bdrv_states, if it is in there */
    bdrv_make_anon(bs);

    g_free(bs);
}
1698

    
1699
/*
 * Attaches dev to bs and resets the I/O status of bs.
 *
 * Returns 0 on success, -EBUSY if bs already has a device attached.
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (!bs->dev) {
        bs->dev = dev;
        bdrv_iostatus_reset(bs);
        return 0;
    }
    return -EBUSY;
}
1709

    
1710
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1711
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1712
{
1713
    if (bdrv_attach_dev(bs, dev) < 0) {
1714
        abort();
1715
    }
1716
}
1717

    
1718
/*
 * Detaches dev from bs. dev must be the device that is currently attached.
 * The device callbacks are cleared and the buffer alignment is set back to
 * 512.
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);

    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;

    bs->buffer_alignment = 512;
}
1727

    
1728
/* TODO change to return DeviceState * when all users are qdevified */
1729
void *bdrv_get_attached_dev(BlockDriverState *bs)
1730
{
1731
    return bs->dev;
1732
}
1733

    
1734
/*
 * Installs the device callbacks for bs. opaque is stored alongside and is
 * the value handed to the callbacks when they are invoked.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_opaque = opaque;
    bs->dev_ops = ops;
}
1740

    
1741
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1742
                               enum MonitorEvent ev,
1743
                               BlockErrorAction action, bool is_read)
1744
{
1745
    QObject *data;
1746
    const char *action_str;
1747

    
1748
    switch (action) {
1749
    case BDRV_ACTION_REPORT:
1750
        action_str = "report";
1751
        break;
1752
    case BDRV_ACTION_IGNORE:
1753
        action_str = "ignore";
1754
        break;
1755
    case BDRV_ACTION_STOP:
1756
        action_str = "stop";
1757
        break;
1758
    default:
1759
        abort();
1760
    }
1761

    
1762
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1763
                              bdrv->device_name,
1764
                              action_str,
1765
                              is_read ? "read" : "write");
1766
    monitor_protocol_event(ev, data);
1767

    
1768
    qobject_decref(data);
1769
}
1770

    
1771
/*
 * Emits a QEVENT_DEVICE_TRAY_MOVED monitor event for bs, with 'tray-open'
 * set to the value of ejected.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *event_data;

    event_data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                    bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event_data);

    /* Drop our reference to the event data */
    qobject_decref(event_data);
}
1781

    
1782
/*
 * Tells the attached device about a media change, if it has a
 * change_media_cb callback, and emits the matching tray events: a tray-open
 * event if the tray was closed before the callback ran, and a tray-close
 * event if a medium was loaded.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool tray_was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* Sample the tray state before the callback gets to run */
    tray_was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (tray_was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1797

    
1798
/*
 * Returns true if no device is attached to bs, or if the attached device
 * provides a change_media_cb callback.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
1802

    
1803
/*
 * Forwards an eject request (with the given force flag) to the attached
 * device, if it provides an eject_request_cb callback.
 */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (!bs->dev_ops || !bs->dev_ops->eject_request_cb) {
        return;
    }
    bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
}
1809

    
1810
/*
 * Returns what the attached device's is_tray_open callback reports, or
 * false if there is no such callback.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_tray_open) {
        return false;
    }
    return bs->dev_ops->is_tray_open(bs->dev_opaque);
}
1817

    
1818
/* Invokes the attached device's resize_cb callback, if it provides one */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->resize_cb) {
        return;
    }
    bs->dev_ops->resize_cb(bs->dev_opaque);
}
1824

    
1825
/*
 * Returns what the attached device's is_medium_locked callback reports, or
 * false if there is no such callback.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_medium_locked) {
        return false;
    }
    return bs->dev_ops->is_medium_locked(bs->dev_opaque);
}
1832

    
1833
/*
1834
 * Run consistency checks on an image
1835
 *
1836
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1837
 * free of errors) or -errno when an internal error occurred. The results of the
1838
 * check are stored in res.
1839
 */
1840
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1841
{
1842
    if (bs->drv->bdrv_check == NULL) {
1843
        return -ENOTSUP;
1844
    }
1845

    
1846
    memset(res, 0, sizeof(*res));
1847
    return bs->drv->bdrv_check(bs, res, fix);
1848
}
1849

    
1850
#define COMMIT_BUF_SECTORS 2048
1851

    
1852
/* commit COW file into the raw image */
1853
int bdrv_commit(BlockDriverState *bs)
1854
{
1855
    BlockDriver *drv = bs->drv;
1856
    int64_t sector, total_sectors;
1857
    int n, ro, open_flags;
1858
    int ret = 0;
1859
    uint8_t *buf;
1860
    char filename[PATH_MAX];
1861

    
1862
    if (!drv)
1863
        return -ENOMEDIUM;
1864
    
1865
    if (!bs->backing_hd) {
1866
        return -ENOTSUP;
1867
    }
1868

    
1869
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1870
        return -EBUSY;
1871
    }
1872

    
1873
    ro = bs->backing_hd->read_only;
1874
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1875
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1876
    open_flags =  bs->backing_hd->open_flags;
1877

    
1878
    if (ro) {
1879
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1880
            return -EACCES;
1881
        }
1882
    }
1883

    
1884
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1885
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1886

    
1887
    for (sector = 0; sector < total_sectors; sector += n) {
1888
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
1889
        if (ret < 0) {
1890
            goto ro_cleanup;
1891
        }
1892
        if (ret) {
1893
            if (bdrv_read(bs, sector, buf, n) != 0) {
1894
                ret = -EIO;
1895
                goto ro_cleanup;
1896
            }
1897

    
1898
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1899
                ret = -EIO;
1900
                goto ro_cleanup;
1901
            }
1902
        }
1903
    }
1904

    
1905
    if (drv->bdrv_make_empty) {
1906
        ret = drv->bdrv_make_empty(bs);
1907
        bdrv_flush(bs);
1908
    }
1909

    
1910
    /*
1911
     * Make sure all data we wrote to the backing device is actually
1912
     * stable on disk.
1913
     */
1914
    if (bs->backing_hd)
1915
        bdrv_flush(bs->backing_hd);
1916

    
1917
ro_cleanup:
1918
    g_free(buf);
1919

    
1920
    if (ro) {
1921
        /* ignoring error return here */
1922
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1923
    }
1924

    
1925
    return ret;
1926
}
1927

    
1928
int bdrv_commit_all(void)
1929
{
1930
    BlockDriverState *bs;
1931

    
1932
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1933
        if (bs->drv && bs->backing_hd) {
1934
            int ret = bdrv_commit(bs);
1935
            if (ret < 0) {
1936
                return ret;
1937
            }
1938
        }
1939
    }
1940
    return 0;
1941
}
1942

    
1943
/**
1944
 * Remove an active request from the tracked requests list
1945
 *
1946
 * This function should be called when a tracked request is completing.
1947
 */
1948
static void tracked_request_end(BdrvTrackedRequest *req)
1949
{
1950
    QLIST_REMOVE(req, list);
1951
    qemu_co_queue_restart_all(&req->wait_queue);
1952
}
1953

    
1954
/**
1955
 * Add an active request to the tracked requests list
1956
 */
1957
static void tracked_request_begin(BdrvTrackedRequest *req,
1958
                                  BlockDriverState *bs,
1959
                                  int64_t sector_num,
1960
                                  int nb_sectors, bool is_write)
1961
{
1962
    *req = (BdrvTrackedRequest){
1963
        .bs = bs,
1964
        .sector_num = sector_num,
1965
        .nb_sectors = nb_sectors,
1966
        .is_write = is_write,
1967
        .co = qemu_coroutine_self(),
1968
    };
1969

    
1970
    qemu_co_queue_init(&req->wait_queue);
1971

    
1972
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1973
}
1974

    
1975
/**
1976
 * Round a region to cluster boundaries
1977
 */
1978
void bdrv_round_to_clusters(BlockDriverState *bs,
1979
                            int64_t sector_num, int nb_sectors,
1980
                            int64_t *cluster_sector_num,
1981
                            int *cluster_nb_sectors)
1982
{
1983
    BlockDriverInfo bdi;
1984

    
1985
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1986
        *cluster_sector_num = sector_num;
1987
        *cluster_nb_sectors = nb_sectors;
1988
    } else {
1989
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1990
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1991
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1992
                                            nb_sectors, c);
1993
    }
1994
}
1995

    
1996
/*
 * Tell whether the sector range [sector_num, sector_num + nb_sectors)
 * intersects the range covered by @req.
 *
 * The ranges are disjoint exactly when one of them starts at or after the
 * end of the other; anything else counts as an overlap.
 */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    /* the range begins before @req ends ... */
    bool starts_before_req_end =
        sector_num < req->sector_num + req->nb_sectors;
    /* ... and @req begins before the range ends */
    bool req_starts_before_end =
        req->sector_num < sector_num + nb_sectors;

    return starts_before_req_end && req_starts_before_end;
}
2008

    
2009
/*
 * Block the calling coroutine until no tracked request on @bs overlaps the
 * cluster-aligned range around [sector_num, sector_num + nb_sectors).
 *
 * Touching the same cluster counts as an overlap.  This guarantees that
 * allocating writes are serialized and do not race with each other for the
 * same cluster.  For example, in copy-on-read it ensures that the CoR read
 * and write operations are atomic and guest writes cannot interleave between
 * them.
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *other;
    int64_t first_sector;
    int sector_count;
    bool waited;

    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &first_sector, &sector_count);

    for (;;) {
        waited = false;

        QLIST_FOREACH(other, &bs->tracked_requests, list) {
            if (!tracked_request_overlaps(other, first_sector,
                                          sector_count)) {
                continue;
            }

            /* An overlap with a request owned by this very coroutine means
             * a reentrant request, for example a block driver issuing
             * nested requests.  That must never happen since it means
             * deadlock.
             */
            assert(qemu_coroutine_self() != other->co);

            qemu_co_queue_wait(&other->wait_queue);

            /* The list may have changed while we slept: rescan from the
             * start instead of continuing the walk.
             */
            waited = true;
            break;
        }

        if (!waited) {
            break;
        }
    }
}
2044

    
2045
/*
2046
 * Return values:
2047
 * 0        - success
2048
 * -EINVAL  - backing format specified, but no file
2049
 * -ENOSPC  - can't update the backing file because no space is left in the
2050
 *            image file header
2051
 * -ENOTSUP - format driver doesn't support changing the backing file
2052
 */
2053
int bdrv_change_backing_file(BlockDriverState *bs,
2054
    const char *backing_file, const char *backing_fmt)
2055
{
2056
    BlockDriver *drv = bs->drv;
2057
    int ret;
2058

    
2059
    /* Backing file format doesn't make sense without a backing file */
2060
    if (backing_fmt && !backing_file) {
2061
        return -EINVAL;
2062
    }
2063

    
2064
    if (drv->bdrv_change_backing_file != NULL) {
2065
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2066
    } else {
2067
        ret = -ENOTSUP;
2068
    }
2069

    
2070
    if (ret == 0) {
2071
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2072
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2073
    }
2074
    return ret;
2075
}
2076

    
2077
/*
2078
 * Finds the image layer in the chain that has 'bs' as its backing file.
2079
 *
2080
 * active is the current topmost image.
2081
 *
2082
 * Returns NULL if bs is not found in active's image chain,
2083
 * or if active == bs.
2084
 */
2085
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2086
                                    BlockDriverState *bs)
2087
{
2088
    BlockDriverState *overlay = NULL;
2089
    BlockDriverState *intermediate;
2090

    
2091
    assert(active != NULL);
2092
    assert(bs != NULL);
2093

    
2094
    /* if bs is the same as active, then by definition it has no overlay
2095
     */
2096
    if (active == bs) {
2097
        return NULL;
2098
    }
2099

    
2100
    intermediate = active;
2101
    while (intermediate->backing_hd) {
2102
        if (intermediate->backing_hd == bs) {
2103
            overlay = intermediate;
2104
            break;
2105
        }
2106
        intermediate = intermediate->backing_hd;
2107
    }
2108

    
2109
    return overlay;
2110
}
2111

    
2112
typedef struct BlkIntermediateStates {
2113
    BlockDriverState *bs;
2114
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2115
} BlkIntermediateStates;
2116

    
2117

    
2118
/*
2119
 * Drops images above 'base' up to and including 'top', and sets the image
2120
 * above 'top' to have base as its backing file.
2121
 *
2122
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2123
 * information in 'bs' can be properly updated.
2124
 *
2125
 * E.g., this will convert the following chain:
2126
 * bottom <- base <- intermediate <- top <- active
2127
 *
2128
 * to
2129
 *
2130
 * bottom <- base <- active
2131
 *
2132
 * It is allowed for bottom==base, in which case it converts:
2133
 *
2134
 * base <- intermediate <- top <- active
2135
 *
2136
 * to
2137
 *
2138
 * base <- active
2139
 *
2140
 * Error conditions:
2141
 *  if active == top, that is considered an error
2142
 *
2143
 */
2144
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2145
                           BlockDriverState *base)
2146
{
2147
    BlockDriverState *intermediate;
2148
    BlockDriverState *base_bs = NULL;
2149
    BlockDriverState *new_top_bs = NULL;
2150
    BlkIntermediateStates *intermediate_state, *next;
2151
    int ret = -EIO;
2152

    
2153
    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2154
    QSIMPLEQ_INIT(&states_to_delete);
2155

    
2156
    if (!top->drv || !base->drv) {
2157
        goto exit;
2158
    }
2159

    
2160
    new_top_bs = bdrv_find_overlay(active, top);
2161

    
2162
    if (new_top_bs == NULL) {
2163
        /* we could not find the image above 'top', this is an error */
2164
        goto exit;
2165
    }
2166

    
2167
    /* special case of new_top_bs->backing_hd already pointing to base - nothing
2168
     * to do, no intermediate images */
2169
    if (new_top_bs->backing_hd == base) {
2170
        ret = 0;
2171
        goto exit;
2172
    }
2173

    
2174
    intermediate = top;
2175

    
2176
    /* now we will go down through the list, and add each BDS we find
2177
     * into our deletion queue, until we hit the 'base'
2178
     */
2179
    while (intermediate) {
2180
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2181
        intermediate_state->bs = intermediate;
2182
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2183

    
2184
        if (intermediate->backing_hd == base) {
2185
            base_bs = intermediate->backing_hd;
2186
            break;
2187
        }
2188
        intermediate = intermediate->backing_hd;
2189
    }
2190
    if (base_bs == NULL) {
2191
        /* something went wrong, we did not end at the base. safely
2192
         * unravel everything, and exit with error */
2193
        goto exit;
2194
    }
2195

    
2196
    /* success - we can delete the intermediate states, and link top->base */
2197
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2198
                                   base_bs->drv ? base_bs->drv->format_name : "");
2199
    if (ret) {
2200
        goto exit;
2201
    }
2202
    new_top_bs->backing_hd = base_bs;
2203

    
2204

    
2205
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2206
        /* so that bdrv_close() does not recursively close the chain */
2207
        intermediate_state->bs->backing_hd = NULL;
2208
        bdrv_unref(intermediate_state->bs);
2209
    }
2210
    ret = 0;
2211

    
2212
exit:
2213
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2214
        g_free(intermediate_state);
2215
    }
2216
    return ret;
2217
}
2218

    
2219

    
2220
/*
 * Validate a byte-granularity request against @bs.
 *
 * Returns:
 *  -ENOMEDIUM  if bdrv_is_inserted() reports no medium
 *  -EIO        if @offset is negative, or (for non-growable devices) the
 *              range [offset, offset + size) does not fit inside the length
 *              reported by bdrv_getlength()
 *  0           otherwise; growable devices skip the length check
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    /* A negative offset is never valid.  Check it before the growable
     * shortcut so that growable devices reject it as well.
     */
    if (offset < 0) {
        return -EIO;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    /* A negative len (error from bdrv_getlength()) fails the first test. */
    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }

    return 0;
}
2241

    
2242
/*
 * Validate a sector-granularity request by converting it to bytes and
 * handing it to bdrv_check_byte_request().
 *
 * Returns -EIO for a negative @nb_sectors, otherwise whatever
 * bdrv_check_byte_request() returns.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    /* The byte count is passed as size_t, so a negative sector count would
     * turn into a huge unsigned value, and bdrv_check_byte_request() does
     * not look at the size at all for growable devices.  Reject it here.
     */
    if (nb_sectors < 0) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2248

    
2249
typedef struct RwCo {
2250
    BlockDriverState *bs;
2251
    int64_t sector_num;
2252
    int nb_sectors;
2253
    QEMUIOVector *qiov;
2254
    bool is_write;
2255
    int ret;
2256
    BdrvRequestFlags flags;
2257
} RwCo;
2258

    
2259
/*
 * Coroutine entry point for synchronous requests.
 *
 * @opaque is a RwCo; depending on its is_write field the request goes to
 * bdrv_co_do_writev() or bdrv_co_do_readv(), and the result is stored in
 * its ret field.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *request = opaque;

    if (request->is_write) {
        request->ret = bdrv_co_do_writev(request->bs, request->sector_num,
                                         request->nb_sectors, request->qiov,
                                         request->flags);
    } else {
        request->ret = bdrv_co_do_readv(request->bs, request->sector_num,
                                        request->nb_sectors, request->qiov,
                                        request->flags);
    }
}
2273

    
2274
/*
2275
 * Process a vectored synchronous request using coroutines
2276
 */
2277
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
2278
                       QEMUIOVector *qiov, bool is_write,
2279
                       BdrvRequestFlags flags)
2280
{
2281
    Coroutine *co;
2282
    RwCo rwco = {
2283
        .bs = bs,
2284
        .sector_num = sector_num,
2285
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
2286
        .qiov = qiov,
2287
        .is_write = is_write,
2288
        .ret = NOT_DONE,
2289
        .flags = flags,
2290
    };
2291
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
2292

    
2293
    /**
2294
     * In sync call context, when the vcpu is blocked, this throttling timer
2295
     * will not fire; so the I/O throttling function has to be disabled here
2296
     * if it has been enabled.
2297
     */
2298
    if (bs->io_limits_enabled) {
2299
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
2300
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
2301
        bdrv_io_limits_disable(bs);
2302
    }
2303

    
2304
    if (qemu_in_coroutine()) {
2305
        /* Fast-path if already in coroutine context */
2306
        bdrv_rw_co_entry(&rwco);
2307
    } else {
2308
        co = qemu_coroutine_create(bdrv_rw_co_entry);
2309
        qemu_coroutine_enter(co, &rwco);
2310
        while (rwco.ret == NOT_DONE) {
2311
            qemu_aio_wait();
2312
        }
2313
    }
2314
    return rwco.ret;
2315
}
2316

    
2317
/*
2318
 * Process a synchronous request using coroutines
2319
 */
2320
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2321
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
2322
{
2323
    QEMUIOVector qiov;
2324
    struct iovec iov = {
2325
        .iov_base = (void *)buf,
2326
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2327
    };
2328

    
2329
    qemu_iovec_init_external(&qiov, &iov, 1);
2330
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
2331
}
2332

    
2333
/* return < 0 if error. See bdrv_write() for the return codes */
2334
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2335
              uint8_t *buf, int nb_sectors)
2336
{
2337
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2338
}
2339

    
2340
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2341
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2342
                          uint8_t *buf, int nb_sectors)
2343
{
2344
    bool enabled;
2345
    int ret;
2346

    
2347
    enabled = bs->io_limits_enabled;
2348
    bs->io_limits_enabled = false;
2349
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2350
    bs->io_limits_enabled = enabled;
2351
    return ret;
2352
}
2353

    
2354
/* Return < 0 if error. Important errors are:
2355
  -EIO         generic I/O error (may happen for all errors)
2356
  -ENOMEDIUM   No media inserted.
2357
  -EINVAL      Invalid sector number or nb_sectors
2358
  -EACCES      Trying to write a read-only device
2359
*/
2360
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2361
               const uint8_t *buf, int nb_sectors)
2362
{
2363
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2364
}
2365

    
2366
/*
 * Synchronously write the contents of @qiov starting at @sector_num.
 * Returns the result of bdrv_rwv_co().
 */
int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    const bool is_write = true;
    const BdrvRequestFlags no_flags = 0;

    return bdrv_rwv_co(bs, sector_num, qiov, is_write, no_flags);
}
2370

    
2371
/*
 * Synchronously issue a write request flagged BDRV_REQ_ZERO_WRITE for
 * @nb_sectors sectors starting at @sector_num.  No data buffer is passed.
 */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    const bool is_write = true;

    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, is_write,
                      BDRV_REQ_ZERO_WRITE);
}
2376

    
2377
/*
 * Synchronously read @count1 bytes at byte position @offset into @buf.
 *
 * The request need not be sector aligned: a partial first sector and a
 * partial last sector are read through a one-sector bounce buffer, the
 * whole sectors in between are read directly into @buf.
 *
 * Returns @count1 on success, or the negative value returned by the failing
 * bdrv_read().
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    uint8_t *dst = buf;
    int remaining = count1;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int len, nb_sectors;
    int ret;

    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > remaining) {
        len = remaining;
    }
    if (len > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        remaining -= len;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        dst += len;
    }

    /* read the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_read(bs, sector_num, dst, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        len = nb_sectors << BDRV_SECTOR_BITS;
        sector_num += nb_sectors;
        dst += len;
        remaining -= len;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf, remaining);
    }

    return count1;
}
2421

    
2422
/*
 * Synchronously write the contents of @qiov at byte position @offset.
 *
 * The request need not be sector aligned: a partial first sector and a
 * partial last sector are handled by reading the sector, patching it in a
 * bounce buffer and writing it back; the whole sectors in between are
 * written straight from @qiov.
 *
 * Returns qiov->size on success, or the negative value returned by the
 * failing read or write.
 */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    int head_len, nb_sectors, remaining;
    int64_t sector_num;
    int ret;

    remaining = qiov->size;

    /* first write to align to sector start */
    head_len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (head_len > remaining) {
        head_len = remaining;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (head_len > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, 0,
                          sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          head_len);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        remaining -= head_len;
        if (remaining == 0) {
            return qiov->size;
        }
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;
        int whole_bytes = nb_sectors << BDRV_SECTOR_BITS;

        /* the aligned part starts right after the head bytes in @qiov */
        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, head_len, whole_bytes);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        remaining -= whole_bytes;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, qiov->size - remaining, sector_buf,
                          remaining);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }

    return qiov->size;
}
2478

    
2479
/*
 * Synchronously write @count1 bytes from @buf at byte position @offset.
 *
 * Wraps @buf in a single-element I/O vector and forwards it to
 * bdrv_pwritev(); the return value is that of bdrv_pwritev().
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector single;
    struct iovec vec;

    vec.iov_base = (void *) buf;
    vec.iov_len = count1;
    qemu_iovec_init_external(&single, &vec, 1);

    return bdrv_pwritev(bs, offset, &single);
}
2491

    
2492
/*
2493
 * Writes to the file and ensures that no writes are reordered across this
2494
 * request (acts as a barrier)
2495
 *
2496
 * Returns 0 on success, -errno in error cases.
2497
 */
2498
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2499
    const void *buf, int count)
2500
{
2501
    int ret;
2502

    
2503
    ret = bdrv_pwrite(bs, offset, buf, count);
2504
    if (ret < 0) {
2505
        return ret;
2506
    }
2507

    
2508
    /* No flush needed for cache modes that already do it */
2509
    if (bs->enable_write_cache) {
2510
        bdrv_flush(bs);
2511
    }
2512

    
2513
    return 0;
2514
}
2515

    
2516
/*
 * Copy-on-read: read the cluster-aligned range around the request through
 * the driver, write it back through the driver, then copy the requested
 * part into @qiov.
 *
 * I/O goes through a temporary bounce buffer so that users who scribble
 * over their read buffer while the operation is in progress do not end up
 * modifying the image file.  This is critical for zero-copy guest I/O where
 * anything might happen inside guest memory.
 *
 * Returns the negative driver error if the read or the write-back failed,
 * otherwise the (non-negative) result of the write-back.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector bounce_qiov;
    struct iovec bounce_iov;
    void *bounce_buffer;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required
     * when allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    bounce_iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    bounce_buffer = qemu_blockalign(bs, bounce_iov.iov_len);
    bounce_iov.iov_base = bounce_buffer;
    qemu_iovec_init_external(&bounce_qiov, &bounce_iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret >= 0) {
        if (drv->bdrv_co_write_zeroes &&
            buffer_is_zero(bounce_buffer, bounce_iov.iov_len)) {
            ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                          cluster_nb_sectors);
        } else {
            /* This does not change the data on the disk, it is not
             * necessary to flush even in cache=writethrough mode.
             */
            ret = drv->bdrv_co_writev(bs, cluster_sector_num,
                                      cluster_nb_sectors, &bounce_qiov);
        }

        /* It might be okay to ignore write errors for guest requests.  If
         * this is a deliberate copy-on-read then we don't want to ignore
         * the error.  Simply report it in all cases: the caller's buffer is
         * only filled when the write-back succeeded.
         */
        if (ret >= 0) {
            skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
            qemu_iovec_from_buf(qiov, 0,
                                (uint8_t *)bounce_buffer + skip_bytes,
                                nb_sectors * BDRV_SECTOR_SIZE);
        }
    }

    qemu_vfree(bounce_buffer);
    return ret;
}
2581

    
2582
/*
2583
 * Handle a read request in coroutine context
2584
 */
2585
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2586
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2587
    BdrvRequestFlags flags)
2588
{
2589
    BlockDriver *drv = bs->drv;
2590
    BdrvTrackedRequest req;
2591
    int ret;
2592

    
2593
    if (!drv) {
2594
        return -ENOMEDIUM;
2595
    }
2596
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2597
        return -EIO;
2598
    }
2599

    
2600
    if (bs->copy_on_read) {
2601
        flags |= BDRV_REQ_COPY_ON_READ;
2602
    }
2603
    if (flags & BDRV_REQ_COPY_ON_READ) {
2604
        bs->copy_on_read_in_flight++;
2605
    }
2606

    
2607
    if (bs->copy_on_read_in_flight) {
2608
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2609
    }
2610

    
2611
    /* throttling disk I/O */
2612
    if (bs->io_limits_enabled) {
2613
        bdrv_io_limits_intercept(bs, nb_sectors, false);
2614
    }
2615

    
2616
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2617

    
2618
    if (flags & BDRV_REQ_COPY_ON_READ) {
2619
        int pnum;
2620

    
2621
        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2622
        if (ret < 0) {
2623
            goto out;
2624
        }
2625

    
2626
        if (!ret || pnum != nb_sectors) {
2627
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2628
            goto out;
2629
        }
2630
    }
2631

    
2632
    if (!(bs->zero_beyond_eof && bs->growable)) {
2633
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2634
    } else {
2635
        /* Read zeros after EOF of growable BDSes */
2636
        int64_t len, total_sectors, max_nb_sectors;
2637

    
2638
        len = bdrv_getlength(bs);
2639
        if (len < 0) {
2640
            ret = len;
2641
            goto out;
2642
        }
2643

    
2644
        total_sectors = len >> BDRV_SECTOR_BITS;
2645
        max_nb_sectors = MAX(0, total_sectors - sector_num);
2646
        if (max_nb_sectors > 0) {
2647
            ret = drv->bdrv_co_readv(bs, sector_num,
2648
                                     MIN(nb_sectors, max_nb_sectors), qiov);
2649
        } else {
2650
            ret = 0;
2651
        }
2652

    
2653
        /* Reading beyond end of file is supposed to produce zeroes */
2654
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2655
            uint64_t offset = MAX(0, total_sectors - sector_num);
2656
            uint64_t bytes = (sector_num + nb_sectors - offset) *
2657
                              BDRV_SECTOR_SIZE;
2658
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2659
        }
2660
    }
2661

    
2662
out:
2663
    tracked_request_end(&req);
2664

    
2665
    if (flags & BDRV_REQ_COPY_ON_READ) {
2666
        bs->copy_on_read_in_flight--;
2667
    }
2668

    
2669
    return ret;
2670
}
2671

    
2672
/*
 * Coroutine read of @nb_sectors sectors starting at @sector_num into @qiov.
 * Emits a trace event and forwards to bdrv_co_do_readv() without flags.
 */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    const BdrvRequestFlags no_flags = 0;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, no_flags);
}
2679

    
2680
/*
 * Coroutine read that requests copy-on-read for this request.
 * Emits a trace event and forwards to bdrv_co_do_readv() with
 * BDRV_REQ_COPY_ON_READ set.
 */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    const BdrvRequestFlags cor_flags = BDRV_REQ_COPY_ON_READ;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, cor_flags);
}
2688

    
2689
/*
 * Write zeroes to @nb_sectors sectors starting at @sector_num.
 *
 * The driver's bdrv_co_write_zeroes callback is tried first when it exists;
 * its result is returned unless it is -ENOTSUP.  Otherwise a zero-filled
 * bounce buffer is written through the driver's bdrv_co_writev callback.
 *
 * TODO Emulate only part of misaligned requests instead of letting block
 * drivers return -ENOTSUP and emulate everything
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector zero_qiov;
    struct iovec zero_iov;
    int ret;

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    zero_iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    zero_iov.iov_base = qemu_blockalign(bs, zero_iov.iov_len);
    memset(zero_iov.iov_base, 0, zero_iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &zero_iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);

    qemu_vfree(zero_iov.iov_base);
    return ret;
}
2719

    
2720
/*
2721
 * Handle a write request in coroutine context
2722
 */
2723
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2724
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2725
    BdrvRequestFlags flags)
2726
{
2727
    BlockDriver *drv = bs->drv;
2728
    BdrvTrackedRequest req;
2729
    int ret;
2730

    
2731
    if (!bs->drv) {
2732
        return -ENOMEDIUM;
2733
    }
2734
    if (bs->read_only) {
2735
        return -EACCES;
2736
    }
2737
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2738
        return -EIO;
2739
    }
2740

    
2741
    if (bs->copy_on_read_in_flight) {
2742
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2743
    }
2744

    
2745
    /* throttling disk I/O */
2746
    if (bs->io_limits_enabled) {
2747
        bdrv_io_limits_intercept(bs, nb_sectors, true);
2748
    }
2749

    
2750
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2751

    
2752
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2753

    
2754
    if (ret < 0) {
2755
        /* Do nothing, write notifier decided to fail this request */
2756
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
2757
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2758
    } else {
2759
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2760
    }
2761

    
2762
    if (ret == 0 && !bs->enable_write_cache) {
2763
        ret = bdrv_co_flush(bs);
2764
    }
2765

    
2766
    if (bs->dirty_bitmap) {
2767
        bdrv_set_dirty(bs, sector_num, nb_sectors);
2768
    }
2769

    
2770
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2771
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
2772
    }
2773
    if (bs->growable && ret >= 0) {
2774
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
2775
    }
2776

    
2777
    tracked_request_end(&req);
2778

    
2779
    return ret;
2780
}
2781

    
2782
/*
 * Coroutine write of @nb_sectors sectors from @qiov starting at @sector_num.
 * Emits a trace event and forwards to bdrv_co_do_writev() without flags.
 */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    const BdrvRequestFlags no_flags = 0;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);
    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, no_flags);
}
2789

    
2790
/*
 * Coroutine zero write of @nb_sectors sectors starting at @sector_num.
 * Emits a trace event and forwards to bdrv_co_do_writev() with
 * BDRV_REQ_ZERO_WRITE set and no data vector.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    const BdrvRequestFlags zero_flags = BDRV_REQ_ZERO_WRITE;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, zero_flags);
}
2798

    
2799
/**
2800
 * Truncate file to 'offset' bytes (needed only for file protocols)
2801
 */
2802
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2803
{
2804
    BlockDriver *drv = bs->drv;
2805
    int ret;
2806
    if (!drv)
2807
        return -ENOMEDIUM;
2808
    if (!drv->bdrv_truncate)
2809
        return -ENOTSUP;
2810
    if (bs->read_only)
2811
        return -EACCES;
2812
    if (bdrv_in_use(bs))
2813
        return -EBUSY;
2814
    ret = drv->bdrv_truncate(bs, offset);
2815
    if (ret == 0) {
2816
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2817
        bdrv_dev_resize_cb(bs);
2818
    }
2819
    return ret;
2820
}
2821

    
2822
/**
2823
 * Length of a allocated file in bytes. Sparse files are counted by actual
2824
 * allocated space. Return < 0 if error or unknown.
2825
 */
2826
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2827
{
2828
    BlockDriver *drv = bs->drv;
2829
    if (!drv) {
2830
        return -ENOMEDIUM;
2831
    }
2832
    if (drv->bdrv_get_allocated_file_size) {
2833
        return drv->bdrv_get_allocated_file_size(bs);
2834
    }
2835
    if (bs->file) {
2836
        return bdrv_get_allocated_file_size(bs->file);
2837
    }
2838
    return -ENOTSUP;
2839
}
2840

    
2841
/**
2842
 * Length of a file in bytes. Return < 0 if error or unknown.
2843
 */
2844
int64_t bdrv_getlength(BlockDriverState *bs)
2845
{
2846
    BlockDriver *drv = bs->drv;
2847
    if (!drv)
2848
        return -ENOMEDIUM;
2849

    
2850
    if (bdrv_dev_has_removable_media(bs)) {
2851
        if (drv->bdrv_getlength) {
2852
            return drv->bdrv_getlength(bs);
2853
        }
2854
    }
2855
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2856
}
2857

    
2858
/* return 0 as number of sectors if no device present or error */
2859
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2860
{
2861
    int64_t length;
2862
    length = bdrv_getlength(bs);
2863
    if (length < 0)
2864
        length = 0;
2865
    else
2866
        length = length >> BDRV_SECTOR_BITS;
2867
    *nb_sectors_ptr = length;
2868
}
2869

    
2870
/* Record the error policies to apply to failed reads and failed writes. */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
2876

    
2877
/* Return the configured error policy for reads (is_read) or for writes. */
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
2881

    
2882
/*
 * Map the configured read/write error policy and an errno value ('error')
 * to the action to take. The "enospc" policy stops only for ENOSPC and
 * reports every other error. An unknown policy value aborts.
 */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError policy;

    if (is_read) {
        policy = bs->on_read_error;
    } else {
        policy = bs->on_write_error;
    }

    if (policy == BLOCKDEV_ON_ERROR_ENOSPC) {
        if (error == ENOSPC) {
            return BDRV_ACTION_STOP;
        }
        return BDRV_ACTION_REPORT;
    }
    if (policy == BLOCKDEV_ON_ERROR_STOP) {
        return BDRV_ACTION_STOP;
    }
    if (policy == BLOCKDEV_ON_ERROR_REPORT) {
        return BDRV_ACTION_REPORT;
    }
    if (policy == BLOCKDEV_ON_ERROR_IGNORE) {
        return BDRV_ACTION_IGNORE;
    }

    abort();
}
2899

    
2900
/* This is done by device models because, while the block layer knows
2901
 * about the error, it does not know whether an operation comes from
2902
 * the device or the block layer (from a job, for example).
2903
 */
2904
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2905
                       bool is_read, int error)
2906
{
2907
    assert(error >= 0);
2908
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
2909
    if (action == BDRV_ACTION_STOP) {
2910
        vm_stop(RUN_STATE_IO_ERROR);
2911
        bdrv_iostatus_set_err(bs, error);
2912
    }
2913
}
2914

    
2915
/* Return the read_only field of 'bs'. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
2919

    
2920
/* Return the sg field of 'bs'. */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
2924

    
2925
/* Return the current enable_write_cache setting of 'bs'. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
2929

    
2930
/*
 * Set the write-cache-enable state and mirror it into the BDRV_O_CACHE_WB
 * open flag so that a reopen() will preserve wce.
 */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    if (!wce) {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
        return;
    }
    bs->open_flags |= BDRV_O_CACHE_WB;
}
2941

    
2942
/*
 * Return 1 if the direct backing image is marked encrypted, otherwise the
 * encrypted field of 'bs' itself.
 */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing != NULL && backing->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
2948

    
2949
/*
 * Return non-zero if 'bs' or its direct backing image is encrypted and
 * does not yet have a valid key.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        if (!backing->valid_key) {
            return 1;
        }
    }

    return (bs->encrypted && !bs->valid_key);
}
2957

    
2958
/*
 * Set the encryption key of 'bs'. An encrypted backing image receives the
 * key first (recursively).
 *
 * Returns a negative value if setting the key on the backing image fails,
 * 0 if only the backing image is encrypted, -EINVAL if 'bs' is not
 * encrypted, -ENOMEDIUM if there is no driver or no bdrv_set_key callback,
 * otherwise the driver callback's result.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;

    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2983

    
2984
/* Return the driver's format name, or NULL when no driver is attached. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }
    return bs->drv->format_name;
}
2988

    
2989
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2990
                         void *opaque)
2991
{
2992
    BlockDriver *drv;
2993

    
2994
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
2995
        it(opaque, drv->format_name);
2996
    }
2997
}
2998

    
2999
BlockDriverState *bdrv_find(const char *name)
3000
{
3001
    BlockDriverState *bs;
3002

    
3003
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3004
        if (!strcmp(name, bs->device_name)) {
3005
            return bs;
3006
        }
3007
    }
3008
    return NULL;
3009
}
3010

    
3011
/*
 * Iterator over the bdrv_states list: pass NULL to obtain the first entry,
 * or an entry to obtain the one following it.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
3018

    
3019
/* Call 'it' with 'opaque' for every entry of the bdrv_states list. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
3027

    
3028
/* Return the device_name field of 'bs'. */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
3032

    
3033
/* Return the open flags currently recorded in 'bs'. */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
3037

    
3038
int bdrv_flush_all(void)
3039
{
3040
    BlockDriverState *bs;
3041
    int result = 0;
3042

    
3043
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3044
        int ret = bdrv_flush(bs);
3045
        if (ret < 0 && !result) {
3046
            result = ret;
3047
        }
3048
    }
3049

    
3050
    return result;
3051
}
3052

    
3053
/* Unconditionally reports 1; 'bs' is not examined. */
int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    const int always_zero_init = 1;

    return always_zero_init;
}
3057

    
3058
/*
 * Report whether 'bs' is known to read as zeroes initially.
 * A driver must be attached (asserted). Returns 0 whenever a backing
 * image exists or the driver has no bdrv_has_zero_init callback;
 * otherwise returns the callback's answer.
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.
       Without a driver callback, 0 is the safe default.  */
    if (bs->backing_hd || !bs->drv->bdrv_has_zero_init) {
        return 0;
    }

    return bs->drv->bdrv_has_zero_init(bs);
}
3074

    
3075
typedef struct BdrvCoGetBlockStatusData {
3076
    BlockDriverState *bs;
3077
    BlockDriverState *base;
3078
    int64_t sector_num;
3079
    int nb_sectors;
3080
    int *pnum;
3081
    int64_t ret;
3082
    bool done;
3083
} BdrvCoGetBlockStatusData;
3084

    
3085
/*
3086
 * Returns true iff the specified sector is present in the disk image. Drivers
3087
 * not implementing the functionality are assumed to not support backing files,
3088
 * hence all their sectors are reported as allocated.
3089
 *
3090
 * If 'sector_num' is beyond the end of the disk image the return value is 0
3091
 * and 'pnum' is set to 0.
3092
 *
3093
 * 'pnum' is set to the number of sectors (including and immediately following
3094
 * the specified sector) that are known to be in the same
3095
 * allocated/unallocated state.
3096
 *
3097
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3098
 * beyond the end of the disk image it will be clamped.
3099
 */
3100
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3101
                                                     int64_t sector_num,
3102
                                                     int nb_sectors, int *pnum)
3103
{
3104
    int64_t length;
3105
    int64_t n;
3106
    int64_t ret, ret2;
3107

    
3108
    length = bdrv_getlength(bs);
3109
    if (length < 0) {
3110
        return length;
3111
    }
3112

    
3113
    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3114
        *pnum = 0;
3115
        return 0;
3116
    }
3117

    
3118
    n = bs->total_sectors - sector_num;
3119
    if (n < nb_sectors) {
3120
        nb_sectors = n;
3121
    }
3122

    
3123
    if (!bs->drv->bdrv_co_get_block_status) {
3124
        *pnum = nb_sectors;
3125
        ret = BDRV_BLOCK_DATA;
3126
        if (bs->drv->protocol_name) {
3127
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3128
        }
3129
        return ret;
3130
    }
3131

    
3132
    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3133
    if (ret < 0) {
3134
        return ret;
3135
    }
3136

    
3137
    if (!(ret & BDRV_BLOCK_DATA)) {
3138
        if (bdrv_has_zero_init(bs)) {
3139
            ret |= BDRV_BLOCK_ZERO;
3140
        } else {
3141
            BlockDriverState *bs2 = bs->backing_hd;
3142
            int64_t length2 = bdrv_getlength(bs2);
3143
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3144
                ret |= BDRV_BLOCK_ZERO;
3145
            }
3146
        }
3147
    }
3148

    
3149
    if (bs->file &&
3150
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3151
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
3152
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3153
                                        *pnum, pnum);
3154
        if (ret2 >= 0) {
3155
            /* Ignore errors.  This is just providing extra information, it
3156
             * is useful but not necessary.
3157
             */
3158
            ret |= (ret2 & BDRV_BLOCK_ZERO);
3159
        }
3160
    }
3161

    
3162
    return ret;
3163
}
3164

    
3165
/* Coroutine wrapper for bdrv_get_block_status() */
3166
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3167
{
3168
    BdrvCoGetBlockStatusData *data = opaque;
3169
    BlockDriverState *bs = data->bs;
3170

    
3171
    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3172
                                         data->pnum);
3173
    data->done = true;
3174
}
3175

    
3176
/*
3177
 * Synchronous wrapper around bdrv_co_get_block_status().
3178
 *
3179
 * See bdrv_co_get_block_status() for details.
3180
 */
3181
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3182
                              int nb_sectors, int *pnum)
3183
{
3184
    Coroutine *co;
3185
    BdrvCoGetBlockStatusData data = {
3186
        .bs = bs,
3187
        .sector_num = sector_num,
3188
        .nb_sectors = nb_sectors,
3189
        .pnum = pnum,
3190
        .done = false,
3191
    };
3192

    
3193
    if (qemu_in_coroutine()) {
3194
        /* Fast-path if already in coroutine context */
3195
        bdrv_get_block_status_co_entry(&data);
3196
    } else {
3197
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3198
        qemu_coroutine_enter(co, &data);
3199
        while (!data.done) {
3200
            qemu_aio_wait();
3201
        }
3202
    }
3203
    return data.ret;
3204
}
3205

    
3206
/*
 * Reduce bdrv_get_block_status() to a yes/no answer.
 *
 * Returns a negative value on error, otherwise non-zero if the range is
 * reported as data, or reported as zero while the image does not have
 * zero-init. 'pnum' is filled in by bdrv_get_block_status().
 */
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t status = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);

    if (status < 0) {
        return status;
    }
    if (status & BDRV_BLOCK_DATA) {
        return 1;
    }
    return (status & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs);
}
3217

    
3218
/*
3219
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3220
 *
3221
 * Return true if the given sector is allocated in any image between
3222
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3223
 * sector is allocated in any image of the chain.  Return false otherwise.
3224
 *
3225
 * 'pnum' is set to the number of sectors (including and immediately following
3226
 *  the specified sector) that are known to be in the same
3227
 *  allocated/unallocated state.
3228
 *
3229
 */
3230
int bdrv_is_allocated_above(BlockDriverState *top,
3231
                            BlockDriverState *base,
3232
                            int64_t sector_num,
3233
                            int nb_sectors, int *pnum)
3234
{
3235
    BlockDriverState *intermediate;
3236
    int ret, n = nb_sectors;
3237

    
3238
    intermediate = top;
3239
    while (intermediate && intermediate != base) {
3240
        int pnum_inter;
3241
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3242
                                &pnum_inter);
3243
        if (ret < 0) {
3244
            return ret;
3245
        } else if (ret) {
3246
            *pnum = pnum_inter;
3247
            return 1;
3248
        }
3249

    
3250
        /*
3251
         * [sector_num, nb_sectors] is unallocated on top but intermediate
3252
         * might have
3253
         *
3254
         * [sector_num+x, nr_sectors] allocated.
3255
         */
3256
        if (n > pnum_inter &&
3257
            (intermediate == top ||
3258
             sector_num + pnum_inter < intermediate->total_sectors)) {
3259
            n = pnum_inter;
3260
        }
3261

    
3262
        intermediate = intermediate->backing_hd;
3263
    }
3264

    
3265
    *pnum = n;
3266
    return 0;
3267
}
3268

    
3269
/*
 * Return the name of the encrypted file: the backing file name if the
 * direct backing image is encrypted, else the image's own filename if it
 * is encrypted, else NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }
    if (bs->encrypted) {
        return bs->filename;
    }
    return NULL;
}
3278

    
3279
/*
 * Copy the backing file name of 'bs' into 'filename' using pstrcpy() with
 * 'filename_size' as the destination size.
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing_name = bs->backing_file;

    pstrcpy(filename, filename_size, backing_name);
}
3284

    
3285
/*
 * Write 'nb_sectors' sectors from 'buf' through the driver's
 * bdrv_write_compressed callback.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if the driver has no such
 * callback, -EIO if bdrv_check_request() rejects the request, otherwise
 * the callback's result. Asserts that no dirty bitmap is attached.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
3300

    
3301
/*
 * Fill '*bdi' through the driver's bdrv_get_info callback; the structure
 * is zeroed before the callback runs.
 *
 * Returns -ENOMEDIUM without a driver and -ENOTSUP if the driver has no
 * callback ('*bdi' is left untouched in both cases), otherwise the
 * callback's result.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
3311

    
3312
/*
 * Save 'size' bytes of VM state from 'buf' at position 'pos' by wrapping
 * the buffer in a single-element I/O vector and calling
 * bdrv_writev_vmstate().
 */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };
    QEMUIOVector qiov;
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    return ret;
}
3324

    
3325
/*
 * Write VM state described by 'qiov' at position 'pos'.
 *
 * The driver's bdrv_save_vmstate callback is used when present; otherwise
 * the request is passed down to bs->file. Returns -ENOMEDIUM without a
 * driver and -ENOTSUP when neither a callback nor bs->file exists.
 */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    }
    if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}
3339

    
3340
/*
 * Load 'size' bytes of VM state from position 'pos' into 'buf'.
 *
 * The driver's bdrv_load_vmstate callback is used when present; otherwise
 * the request is passed down to bs->file. Returns -ENOMEDIUM without a
 * driver and -ENOTSUP when neither a callback nor bs->file exists.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_load_vmstate) {
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    }
    if (bs->file) {
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    }

    return -ENOTSUP;
}
3352

    
3353
/*
 * Pass a debug event to the driver of 'bs'. Does nothing if 'bs' is NULL,
 * has no driver, or the driver has no bdrv_debug_event callback.
 */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (bs && bs->drv && bs->drv->bdrv_debug_event) {
        bs->drv->bdrv_debug_event(bs, event);
    }
}
3361

    
3362
/*
 * Invoke bdrv_debug_breakpoint on the first node, starting at 'bs' and
 * following bs->file links, whose driver implements it. The walk stops at
 * a NULL node or a node without a driver, in which case -ENOTSUP is
 * returned.
 */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_breakpoint) {
            return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
        }
    }

    return -ENOTSUP;
}
3375

    
3376
/*
 * Invoke bdrv_debug_resume on the first node, starting at 'bs' and
 * following bs->file links, whose driver implements it. The walk stops at
 * a NULL node or a node without a driver, in which case -ENOTSUP is
 * returned.
 */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_resume) {
            return bs->drv->bdrv_debug_resume(bs, tag);
        }
    }

    return -ENOTSUP;
}
3388

    
3389
/*
 * Ask the first node, starting at 'bs' and following bs->file links, whose
 * driver implements bdrv_debug_is_suspended. The walk stops at a NULL node
 * or a node without a driver, in which case false is returned.
 */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_is_suspended) {
            return bs->drv->bdrv_debug_is_suspended(bs, tag);
        }
    }

    return false;
}
3401

    
3402
/* Return 1 if BDRV_O_SNAPSHOT is set in the open flags of 'bs', else 0. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
3406

    
3407
/* backing_file can either be relative, or absolute, or a protocol.  If it is
3408
 * relative, it must be relative to the chain.  So, passing in bs->filename
3409
 * from a BDS as backing_file should not be done, as that may be relative to
3410
 * the CWD rather than the chain. */
3411
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3412
        const char *backing_file)
3413
{
3414
    char *filename_full = NULL;
3415
    char *backing_file_full = NULL;
3416
    char *filename_tmp = NULL;
3417
    int is_protocol = 0;
3418
    BlockDriverState *curr_bs = NULL;
3419
    BlockDriverState *retval = NULL;
3420

    
3421
    if (!bs || !bs->drv || !backing_file) {
3422
        return NULL;
3423
    }
3424

    
3425
    filename_full     = g_malloc(PATH_MAX);
3426
    backing_file_full = g_malloc(PATH_MAX);
3427
    filename_tmp      = g_malloc(PATH_MAX);
3428

    
3429
    is_protocol = path_has_protocol(backing_file);
3430

    
3431
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3432

    
3433
        /* If either of the filename paths is actually a protocol, then
3434
         * compare unmodified paths; otherwise make paths relative */
3435
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3436
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3437
                retval = curr_bs->backing_hd;
3438
                break;
3439
            }
3440
        } else {
3441
            /* If not an absolute filename path, make it relative to the current
3442
             * image's filename path */
3443
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3444
                         backing_file);
3445

    
3446
            /* We are going to compare absolute pathnames */
3447
            if (!realpath(filename_tmp, filename_full)) {
3448
                continue;
3449
            }
3450

    
3451
            /* We need to make sure the backing filename we are comparing against
3452
             * is relative to the current image filename (or absolute) */
3453
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3454
                         curr_bs->backing_file);
3455

    
3456
            if (!realpath(filename_tmp, backing_file_full)) {
3457
                continue;
3458
            }
3459

    
3460
            if (strcmp(backing_file_full, filename_full) == 0) {
3461
                retval = curr_bs->backing_hd;
3462
                break;
3463
            }
3464
        }
3465
    }
3466

    
3467
    g_free(filename_full);
3468
    g_free(backing_file_full);
3469
    g_free(filename_tmp);
3470
    return retval;
3471
}
3472

    
3473
/*
 * Count the backing_hd links below 'bs'. Counting stops at the first image
 * that has no driver or no backing image.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
3485

    
3486
/*
 * Follow backing_hd links from 'bs' and return the last image of the chain
 * (the one without a backing image). Returns NULL if 'bs' is NULL.
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *node;

    if (!bs) {
        return NULL;
    }

    for (node = bs; node->backing_hd; node = node->backing_hd) {
        /* keep descending */
    }
    return node;
}
3501

    
3502
/**************************************************************/
3503
/* async I/Os */
3504

    
3505
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3506
                                 QEMUIOVector *qiov, int nb_sectors,
3507
                                 BlockDriverCompletionFunc *cb, void *opaque)
3508
{
3509
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3510

    
3511
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3512
                                 cb, opaque, false);
3513
}
3514

    
3515
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3516
                                  QEMUIOVector *qiov, int nb_sectors,
3517
                                  BlockDriverCompletionFunc *cb, void *opaque)
3518
{
3519
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3520

    
3521
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3522
                                 cb, opaque, true);
3523
}
3524

    
3525

    
3526
typedef struct MultiwriteCB {
3527
    int error;
3528
    int num_requests;
3529
    int num_callbacks;
3530
    struct {
3531
        BlockDriverCompletionFunc *cb;
3532
        void *opaque;
3533
        QEMUIOVector *free_qiov;
3534
    } callbacks[];
3535
} MultiwriteCB;
3536

    
3537
/*
 * Invoke every caller callback with the recorded error, then destroy and
 * free the merged qiov attached to that callback slot, if any.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx;

    for (idx = 0; idx < mcb->num_callbacks; idx++) {
        QEMUIOVector *merged;

        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        merged = mcb->callbacks[idx].free_qiov;
        if (merged) {
            qemu_iovec_destroy(merged);
        }
        g_free(merged);
    }
}
3549

    
3550
static void multiwrite_cb(void *opaque, int ret)
3551
{
3552
    MultiwriteCB *mcb = opaque;
3553

    
3554
    trace_multiwrite_cb(mcb, ret);
3555

    
3556
    if (ret < 0 && !mcb->error) {
3557
        mcb->error = ret;
3558
    }
3559

    
3560
    mcb->num_requests--;
3561
    if (mcb->num_requests == 0) {
3562
        multiwrite_user_cb(mcb);
3563
        g_free(mcb);
3564
    }
3565
}
3566

    
3567
static int multiwrite_req_compare(const void *a, const void *b)
3568
{
3569
    const BlockRequest *req1 = a, *req2 = b;
3570

    
3571
    /*
3572
     * Note that we can't simply subtract req2->sector from req1->sector
3573
     * here as that could overflow the return value.
3574
     */
3575
    if (req1->sector > req2->sector) {
3576
        return 1;
3577
    } else if (req1->sector < req2->sector) {
3578
        return -1;
3579
    } else {
3580
        return 0;
3581
    }
3582
}
3583

    
3584
/*
3585
 * Takes a bunch of requests and tries to merge them. Returns the number of
3586
 * requests that remain after merging.
3587
 */
3588
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3589
    int num_reqs, MultiwriteCB *mcb)
3590
{
3591
    int i, outidx;
3592

    
3593
    // Sort requests by start sector
3594
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3595

    
3596
    // Check if adjacent requests touch the same clusters. If so, combine them,
3597
    // filling up gaps with zero sectors.
3598
    outidx = 0;
3599
    for (i = 1; i < num_reqs; i++) {
3600
        int merge = 0;
3601
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3602

    
3603
        // Handle exactly sequential writes and overlapping writes.
3604
        if (reqs[i].sector <= oldreq_last) {
3605
            merge = 1;
3606
        }
3607

    
3608
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3609
            merge = 0;
3610
        }
3611

    
3612
        if (merge) {
3613
            size_t size;
3614
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3615
            qemu_iovec_init(qiov,
3616
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3617

    
3618
            // Add the first request to the merged one. If the requests are
3619
            // overlapping, drop the last sectors of the first request.
3620
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
3621
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3622

    
3623
            // We should need to add any zeros between the two requests
3624
            assert (reqs[i].sector <= oldreq_last);
3625

    
3626
            // Add the second request
3627
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3628

    
3629
            reqs[outidx].nb_sectors = qiov->size >> 9;
3630
            reqs[outidx].qiov = qiov;
3631

    
3632
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3633
        } else {
3634
            outidx++;
3635
            reqs[outidx].sector     = reqs[i].sector;
3636
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3637
            reqs[outidx].qiov       = reqs[i].qiov;
3638
        }
3639
    }
3640

    
3641
    return outidx + 1;
3642
}
3643

    
3644
/*
3645
 * Submit multiple AIO write requests at once.
3646
 *
3647
 * On success, the function returns 0 and all requests in the reqs array have
3648
 * been submitted. In error case this function returns -1, and any of the
3649
 * requests may or may not be submitted yet. In particular, this means that the
3650
 * callback will be called for some of the requests, for others it won't. The
3651
 * caller must check the error field of the BlockRequest to wait for the right
3652
 * callbacks (if error != 0, no callback will be called).
3653
 *
3654
 * The implementation may modify the contents of the reqs array, e.g. to merge
3655
 * requests. However, the fields opaque and error are left unmodified as they
3656
 * are used to signal failure for a single request to the caller.
3657
 */
3658
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3659
{
3660
    MultiwriteCB *mcb;
3661
    int i;
3662

    
3663
    /* don't submit writes if we don't have a medium */
3664
    if (bs->drv == NULL) {
3665
        for (i = 0; i < num_reqs; i++) {
3666
            reqs[i].error = -ENOMEDIUM;
3667
        }
3668
        return -1;
3669
    }
3670

    
3671
    if (num_reqs == 0) {
3672
        return 0;
3673
    }
3674

    
3675
    // Create MultiwriteCB structure
3676
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3677
    mcb->num_requests = 0;
3678
    mcb->num_callbacks = num_reqs;
3679

    
3680
    for (i = 0; i < num_reqs; i++) {
3681
        mcb->callbacks[i].cb = reqs[i].cb;
3682
        mcb->callbacks[i].opaque = reqs[i].opaque;
3683
    }
3684

    
3685
    // Check for mergable requests
3686
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3687

    
3688
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3689

    
3690
    /* Run the aio requests. */
3691
    mcb->num_requests = num_reqs;
3692
    for (i = 0; i < num_reqs; i++) {
3693
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3694
            reqs[i].nb_sectors, multiwrite_cb, mcb);
3695
    }
3696

    
3697
    return 0;
3698
}
3699

    
3700
/* Cancel 'acb' by calling the cancel function of its AIOCBInfo. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    const AIOCBInfo *info = acb->aiocb_info;

    info->cancel(acb);
}
3704

    
3705
/**************************************************************/
3706
/* async block device emulation */
3707

    
3708
typedef struct BlockDriverAIOCBSync {
3709
    BlockDriverAIOCB common;
3710
    QEMUBH *bh;
3711
    int ret;
3712
    /* vector translation state */
3713
    QEMUIOVector *qiov;
3714
    uint8_t *bounce;
3715
    int is_write;
3716
} BlockDriverAIOCBSync;
3717

    
3718
/*
 * Cancel an emulated AIO request: delete its bottom half, clear the
 * pointer and release the AIOCB.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *sync_acb;

    sync_acb = container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(sync_acb->bh);
    sync_acb->bh = NULL;
    qemu_aio_release(sync_acb);
}
3726

    
3727
static const AIOCBInfo bdrv_em_aiocb_info = {
3728
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3729
    .cancel             = bdrv_aio_cancel_em,
3730
};
3731

    
3732
static void bdrv_aio_bh_cb(void *opaque)
3733
{
3734
    BlockDriverAIOCBSync *acb = opaque;
3735

    
3736
    if (!acb->is_write)
3737
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3738
    qemu_vfree(acb->bounce);
3739
    acb->common.cb(acb->common.opaque, acb->ret);
3740
    qemu_bh_delete(acb->bh);
3741
    acb->bh = NULL;
3742
    qemu_aio_release(acb);
3743
}
3744

    
3745
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3746
                                            int64_t sector_num,
3747
                                            QEMUIOVector *qiov,
3748
                                            int nb_sectors,
3749
                                            BlockDriverCompletionFunc *cb,
3750
                                            void *opaque,
3751
                                            int is_write)
3752

    
3753
{
3754
    BlockDriverAIOCBSync *acb;
3755

    
3756
    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
3757
    acb->is_write = is_write;
3758
    acb->qiov = qiov;
3759
    acb->bounce = qemu_blockalign(bs, qiov->size);
3760
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3761

    
3762
    if (is_write) {
3763
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3764
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3765
    } else {
3766
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3767
    }
3768

    
3769
    qemu_bh_schedule(acb->bh);
3770

    
3771
    return &acb->common;
3772
}
3773

    
3774
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3775
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3776
        BlockDriverCompletionFunc *cb, void *opaque)
3777
{
3778
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3779
}
3780

    
3781
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3782
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3783
        BlockDriverCompletionFunc *cb, void *opaque)
3784
{
3785
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3786
}
3787

    
3788

    
3789
typedef struct BlockDriverAIOCBCoroutine {
3790
    BlockDriverAIOCB common;
3791
    BlockRequest req;
3792
    bool is_write;
3793
    bool *done;
3794
    QEMUBH* bh;
3795
} BlockDriverAIOCBCoroutine;
3796

    
3797
/*
 * "Cancel" a coroutine-based request by waiting for it: registers a local
 * flag in the AIOCB and calls qemu_aio_wait() until the flag has been set.
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *co_acb;
    bool finished = false;

    co_acb = container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    co_acb->done = &finished;

    do {
        qemu_aio_wait();
    } while (!finished);
}
3808

    
3809
static const AIOCBInfo bdrv_em_co_aiocb_info = {
3810
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3811
    .cancel             = bdrv_aio_co_cancel_em,
3812
};
3813

    
3814
static void bdrv_co_em_bh(void *opaque)
3815
{
3816
    BlockDriverAIOCBCoroutine *acb = opaque;
3817

    
3818
    acb->common.cb(acb->common.opaque, acb->req.error);
3819

    
3820
    if (acb->done) {
3821
        *acb->done = true;
3822
    }
3823

    
3824
    qemu_bh_delete(acb->bh);
3825
    qemu_aio_release(acb);
3826
}
3827

    
3828
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3829
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3830
{
3831
    BlockDriverAIOCBCoroutine *acb = opaque;
3832
    BlockDriverState *bs = acb->common.bs;
3833

    
3834
    if (!acb->is_write) {
3835
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3836
            acb->req.nb_sectors, acb->req.qiov, 0);
3837
    } else {
3838
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3839
            acb->req.nb_sectors, acb->req.qiov, 0);
3840
    }
3841

    
3842
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3843
    qemu_bh_schedule(acb->bh);
3844
}
3845

    
3846
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3847
                                               int64_t sector_num,
3848
                                               QEMUIOVector *qiov,
3849
                                               int nb_sectors,
3850
                                               BlockDriverCompletionFunc *cb,
3851
                                               void *opaque,
3852
                                               bool is_write)
3853
{
3854
    Coroutine *co;
3855
    BlockDriverAIOCBCoroutine *acb;
3856

    
3857
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3858
    acb->req.sector = sector_num;
3859
    acb->req.nb_sectors = nb_sectors;
3860
    acb->req.qiov = qiov;
3861
    acb->is_write = is_write;
3862
    acb->done = NULL;
3863

    
3864
    co = qemu_coroutine_create(bdrv_co_do_rw);
3865
    qemu_coroutine_enter(co, acb);
3866

    
3867
    return &acb->common;
3868
}
3869

    
3870
/*
 * Coroutine entry point for bdrv_aio_flush(): runs bdrv_co_flush(), stores
 * its result in req.error and schedules the completion bottom half.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *co_acb = opaque;

    co_acb->req.error = bdrv_co_flush(co_acb->common.bs);

    co_acb->bh = qemu_bh_new(bdrv_co_em_bh, co_acb);
    qemu_bh_schedule(co_acb->bh);
}
3879

    
3880
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3881
        BlockDriverCompletionFunc *cb, void *opaque)
3882
{
3883
    trace_bdrv_aio_flush(bs, opaque);
3884

    
3885
    Coroutine *co;
3886
    BlockDriverAIOCBCoroutine *acb;
3887

    
3888
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3889
    acb->done = NULL;
3890

    
3891
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3892
    qemu_coroutine_enter(co, acb);
3893

    
3894
    return &acb->common;
3895
}
3896

    
3897
/*
 * Coroutine entry point for bdrv_aio_discard(): runs bdrv_co_discard() on
 * the recorded sector range, stores the result in req.error and schedules
 * the completion bottom half.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *co_acb = opaque;

    co_acb->req.error = bdrv_co_discard(co_acb->common.bs,
                                        co_acb->req.sector,
                                        co_acb->req.nb_sectors);

    co_acb->bh = qemu_bh_new(bdrv_co_em_bh, co_acb);
    qemu_bh_schedule(co_acb->bh);
}
3906

    
3907
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3908
        int64_t sector_num, int nb_sectors,
3909
        BlockDriverCompletionFunc *cb, void *opaque)
3910
{
3911
    Coroutine *co;
3912
    BlockDriverAIOCBCoroutine *acb;
3913

    
3914
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3915

    
3916
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3917
    acb->req.sector = sector_num;
3918
    acb->req.nb_sectors = nb_sectors;
3919
    acb->done = NULL;
3920
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3921
    qemu_coroutine_enter(co, acb);
3922

    
3923
    return &acb->common;
3924
}
3925

    
3926
void bdrv_init(void)
3927
{
3928
    module_call_init(MODULE_INIT_BLOCK);
3929
}
3930

    
3931
void bdrv_init_with_whitelist(void)
3932
{
3933
    use_bdrv_whitelist = 1;
3934
    bdrv_init();
3935
}
3936

    
3937
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
3938
                   BlockDriverCompletionFunc *cb, void *opaque)
3939
{
3940
    BlockDriverAIOCB *acb;
3941

    
3942
    acb = g_slice_alloc(aiocb_info->aiocb_size);
3943
    acb->aiocb_info = aiocb_info;
3944
    acb->bs = bs;
3945
    acb->cb = cb;
3946
    acb->opaque = opaque;
3947
    return acb;
3948
}
3949

    
3950
void qemu_aio_release(void *p)
3951
{
3952
    BlockDriverAIOCB *acb = p;
3953
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
3954
}
3955

    
3956
/**************************************************************/
3957
/* Coroutine block device emulation */
3958

    
3959
typedef struct CoroutineIOCompletion {
3960
    Coroutine *coroutine;
3961
    int ret;
3962
} CoroutineIOCompletion;
3963

    
3964
static void bdrv_co_io_em_complete(void *opaque, int ret)
3965
{
3966
    CoroutineIOCompletion *co = opaque;
3967

    
3968
    co->ret = ret;
3969
    qemu_coroutine_enter(co->coroutine, NULL);
3970
}
3971

    
3972
/*
 * Coroutine read/write emulation on top of the driver's AIO callbacks.
 *
 * Submits the request through bdrv_aio_writev or bdrv_aio_readv with
 * bdrv_co_io_em_complete() as completion callback, yields, and returns the
 * result stored by that callback.  Returns -EIO if the driver did not hand
 * back an AIOCB.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    BlockDriverAIOCB *acb;
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &completion);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &completion);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    /* Resumed by bdrv_co_io_em_complete() */
    qemu_coroutine_yield();

    return completion.ret;
}
3997

    
3998
/* Coroutine read emulated via bdrv_co_io_em() with is_write = false */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4004

    
4005
/* Coroutine write emulated via bdrv_co_io_em() with is_write = true */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4011

    
4012
/*
 * Coroutine entry point for bdrv_flush(): flushes rwco->bs and stores the
 * result in rwco->ret.
 */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *flush_req = opaque;

    flush_req->ret = bdrv_co_flush(flush_req->bs);
}
4018

    
4019
/*
 * Flush @bs and then, recursively, bs->file.
 *
 * Returns 0 right away if @bs is NULL, has no medium inserted or is
 * read-only.  Otherwise the driver's flush-to-OS callback (if any) is run,
 * followed - unless BDRV_O_NO_FLUSH is set - by its flush-to-disk or AIO
 * flush callback.  The first negative result is returned; on success the
 * result of flushing bs->file is returned.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);

        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion completion = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
                                          &completion);
            if (acb != NULL) {
                /* Resumed by bdrv_co_io_em_complete() */
                qemu_coroutine_yield();
                ret = completion.ret;
            } else {
                ret = -EIO;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and don't support bdrv_flush therefore. Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk. Returning an
             * error doesn't work because that would break guests even if
             * the server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }

        if (ret < 0) {
            return ret;
        }
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
4081

    
4082
/*
 * Call the driver's bdrv_invalidate_cache callback for @bs, if a driver is
 * attached and it provides one; otherwise do nothing.
 */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (!bs->drv || !bs->drv->bdrv_invalidate_cache) {
        return;
    }

    bs->drv->bdrv_invalidate_cache(bs);
}
4088

    
4089
void bdrv_invalidate_cache_all(void)
4090
{
4091
    BlockDriverState *bs;
4092

    
4093
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4094
        bdrv_invalidate_cache(bs);
4095
    }
4096
}
4097

    
4098
void bdrv_clear_incoming_migration_all(void)
4099
{
4100
    BlockDriverState *bs;
4101

    
4102
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4103
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4104
    }
4105
}
4106

    
4107
/*
 * Synchronous flush of @bs.
 *
 * Inside a coroutine the flush entry function is called directly.  Outside
 * of one, a coroutine is created for it and qemu_aio_wait() is run until
 * the result is no longer NOT_DONE.
 *
 * Returns the result of bdrv_co_flush().
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo flush_req = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *worker = qemu_coroutine_create(bdrv_flush_co_entry);

        qemu_coroutine_enter(worker, &flush_req);
        while (flush_req.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_req);
    }

    return flush_req.ret;
}
4128

    
4129
/*
 * Coroutine entry point for bdrv_discard(): discards the sector range
 * described by the RwCo and stores the result in its ret field.
 */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *discard_req = opaque;

    discard_req->ret = bdrv_co_discard(discard_req->bs,
                                       discard_req->sector_num,
                                       discard_req->nb_sectors);
}
4135

    
4136
/*
 * Discard @nb_sectors sectors starting at @sector_num.
 *
 * Returns -ENOMEDIUM if no driver is attached, -EIO if bdrv_check_request()
 * rejects the range and -EROFS if @bs is read-only.  The range is cleared
 * in the dirty bitmap (when one exists) before anything else happens.
 * If BDRV_O_UNMAP is not set, or the driver offers neither a coroutine nor
 * an AIO discard callback, 0 is returned without touching the driver;
 * otherwise the driver's result is returned (-EIO if the AIO callback
 * returns no AIOCB).
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
    if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    }

    if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion completion = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &completion);
        if (acb == NULL) {
            return -EIO;
        }

        /* Resumed by bdrv_co_io_em_complete() */
        qemu_coroutine_yield();
        return completion.ret;
    }

    return 0;
}
4176

    
4177
/*
 * Synchronous discard of @nb_sectors sectors starting at @sector_num.
 *
 * Inside a coroutine the discard entry function is called directly.
 * Outside of one, a coroutine is created for it and qemu_aio_wait() is run
 * until the result is no longer NOT_DONE.
 *
 * Returns the result of bdrv_co_discard().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    RwCo discard_req = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *worker = qemu_coroutine_create(bdrv_discard_co_entry);

        qemu_coroutine_enter(worker, &discard_req);
        while (discard_req.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&discard_req);
    }

    return discard_req.ret;
}
4200

    
4201
/**************************************************************/
4202
/* removable device support */
4203

    
4204
/**
4205
 * Return TRUE if the media is present
4206
 */
4207
int bdrv_is_inserted(BlockDriverState *bs)
4208
{
4209
    BlockDriver *drv = bs->drv;
4210

    
4211
    if (!drv)
4212
        return 0;
4213
    if (!drv->bdrv_is_inserted)
4214
        return 1;
4215
    return drv->bdrv_is_inserted(bs);
4216
}
4217

    
4218
/**
4219
 * Return whether the media changed since the last call to this
4220
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4221
 */
4222
int bdrv_media_changed(BlockDriverState *bs)
4223
{
4224
    BlockDriver *drv = bs->drv;
4225

    
4226
    if (drv && drv->bdrv_media_changed) {
4227
        return drv->bdrv_media_changed(bs);
4228
    }
4229
    return -ENOTSUP;
4230
}
4231

    
4232
/**
4233
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4234
 */
4235
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4236
{
4237
    BlockDriver *drv = bs->drv;
4238

    
4239
    if (drv && drv->bdrv_eject) {
4240
        drv->bdrv_eject(bs, eject_flag);
4241
    }
4242

    
4243
    if (bs->device_name[0] != '\0') {
4244
        bdrv_emit_qmp_eject_event(bs, eject_flag);
4245
    }
4246
}
4247

    
4248
/**
4249
 * Lock or unlock the media (if it is locked, the user won't be able
4250
 * to eject it manually).
4251
 */
4252
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4253
{
4254
    BlockDriver *drv = bs->drv;
4255

    
4256
    trace_bdrv_lock_medium(bs, locked);
4257

    
4258
    if (drv && drv->bdrv_lock_medium) {
4259
        drv->bdrv_lock_medium(bs, locked);
4260
    }
4261
}
4262

    
4263
/* needed for generic scsi interface */
4264

    
4265
/*
 * Forward an ioctl request to the driver's bdrv_ioctl callback.
 * Returns -ENOTSUP if there is no driver or the driver has no such callback.
 */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }

    return drv->bdrv_ioctl(bs, req, buf);
}
4273

    
4274
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4275
        unsigned long int req, void *buf,
4276
        BlockDriverCompletionFunc *cb, void *opaque)
4277
{
4278
    BlockDriver *drv = bs->drv;
4279

    
4280
    if (drv && drv->bdrv_aio_ioctl)
4281
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4282
    return NULL;
4283
}
4284

    
4285
/*
 * Record @align as the buffer alignment of @bs.  The value is consumed by
 * qemu_blockalign() and bdrv_qiov_is_aligned().
 */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
4289

    
4290
/*
 * Allocate @size bytes with qemu_memalign(), aligned to the buffer
 * alignment of @bs.  An alignment of 512 is used when @bs is NULL or its
 * buffer_alignment is zero.
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    if (bs && bs->buffer_alignment) {
        return qemu_memalign(bs->buffer_alignment, size);
    }

    return qemu_memalign(512, size);
}
4294

    
4295
/*
4296
 * Check if all memory in this vector is sector aligned.
4297
 */
4298
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4299
{
4300
    int i;
4301

    
4302
    for (i = 0; i < qiov->niov; i++) {
4303
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4304
            return false;
4305
        }
4306
    }
4307

    
4308
    return true;
4309
}
4310

    
4311
/*
 * Enable or disable dirty tracking on @bs.
 *
 * @granularity is given in bytes and must be zero or a power of two.
 * A non-zero value allocates a dirty bitmap covering the current length of
 * @bs (no bitmap may exist yet); it must be at least one sector, because it
 * is converted to sectors before use.  Zero frees an existing bitmap.
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        granularity >>= BDRV_SECTOR_BITS;
        /*
         * A byte granularity smaller than one sector becomes 0 here, and
         * ffs(0) - 1 below would then yield -1.  Catch it explicitly.
         */
        assert(granularity > 0);
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
4329

    
4330
/*
 * Query the dirty bitmap of @bs for @sector.
 * Returns 0 when dirty tracking is not enabled (no bitmap allocated).
 */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_get(bs->dirty_bitmap, sector);
}
4338

    
4339
/*
 * Initialise @hbi to iterate over the dirty bitmap of @bs, starting at
 * position 0.  The bitmap pointer is passed on unchecked.
 */
void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    const uint64_t first = 0;

    hbitmap_iter_init(hbi, bs->dirty_bitmap, first);
}
4343

    
4344
/*
 * Mark @nr_sectors sectors starting at @cur_sector as dirty.
 * The bitmap pointer is passed to hbitmap_set() unchecked.
 */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    HBitmap *bitmap = bs->dirty_bitmap;

    hbitmap_set(bitmap, cur_sector, nr_sectors);
}
4349

    
4350
/*
 * Mark @nr_sectors sectors starting at @cur_sector as clean.
 * The bitmap pointer is passed to hbitmap_reset() unchecked.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    HBitmap *bitmap = bs->dirty_bitmap;

    hbitmap_reset(bitmap, cur_sector, nr_sectors);
}
4355

    
4356
/*
 * Return hbitmap_count() of the dirty bitmap of @bs, or 0 when dirty
 * tracking is not enabled (no bitmap allocated).
 */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_count(bs->dirty_bitmap);
}
4364

    
4365
/* Get a reference to bs */
4366
void bdrv_ref(BlockDriverState *bs)
4367
{
4368
    bs->refcnt++;
4369
}
4370

    
4371
/* Release a previously grabbed reference to bs.
4372
 * If after releasing, reference count is zero, the BlockDriverState is
4373
 * deleted. */
4374
void bdrv_unref(BlockDriverState *bs)
4375
{
4376
    assert(bs->refcnt > 0);
4377
    if (--bs->refcnt == 0) {
4378
        bdrv_delete(bs);
4379
    }
4380
}
4381

    
4382
/*
 * Set the in_use flag of @bs.  The new value must differ from the current
 * one; setting it to what it already is trips an assertion.
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(in_use != bs->in_use);

    bs->in_use = in_use;
}
4387

    
4388
/* Return the in_use flag of @bs as set by bdrv_set_in_use() */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
4392

    
4393
/* Turn on I/O status tracking for @bs and reset the status to OK */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    bs->iostatus_enabled = true;
}
4398

    
4399
/* The I/O status is only enabled if the drive explicitly
4400
 * enables it _and_ the VM is configured to stop on errors */
4401
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4402
{
4403
    return (bs->iostatus_enabled &&
4404
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4405
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4406
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4407
}
4408

    
4409
/* Turn off I/O status tracking for @bs; the stored status is left as is */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
4413

    
4414
/*
 * Reset the I/O status of @bs to OK, and that of its block job if one is
 * attached.  Does nothing unless bdrv_iostatus_is_enabled() holds.
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;

    if (bs->job) {
        block_job_iostatus_reset(bs->job);
    }
}
4423

    
4424
/*
 * Record an I/O error in the status of @bs.
 *
 * @error is a positive errno value: ENOSPC maps to the NOSPACE status,
 * anything else to FAILED.  Only the first error sticks - if the status is
 * already something other than OK it is left unchanged.  I/O status
 * tracking must be enabled.
 */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));

    if (bs->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
        return;
    }

    if (error == ENOSPC) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
    } else {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
4432

    
4433
void
4434
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4435
        enum BlockAcctType type)
4436
{
4437
    assert(type < BDRV_MAX_IOTYPE);
4438

    
4439
    cookie->bytes = bytes;
4440
    cookie->start_time_ns = get_clock();
4441
    cookie->type = type;
4442
}
4443

    
4444
void
4445
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4446
{
4447
    assert(cookie->type < BDRV_MAX_IOTYPE);
4448

    
4449
    bs->nr_bytes[cookie->type] += cookie->bytes;
4450
    bs->nr_ops[cookie->type]++;
4451
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4452
}
4453

    
4454
void bdrv_img_create(const char *filename, const char *fmt,
4455
                     const char *base_filename, const char *base_fmt,
4456
                     char *options, uint64_t img_size, int flags,
4457
                     Error **errp, bool quiet)
4458
{
4459
    QEMUOptionParameter *param = NULL, *create_options = NULL;
4460
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
4461
    BlockDriverState *bs = NULL;
4462
    BlockDriver *drv, *proto_drv;
4463
    BlockDriver *backing_drv = NULL;
4464
    int ret = 0;
4465

    
4466
    /* Find driver and parse its options */
4467
    drv = bdrv_find_format(fmt);
4468
    if (!drv) {
4469
        error_setg(errp, "Unknown file format '%s'", fmt);
4470
        return;
4471
    }
4472

    
4473
    proto_drv = bdrv_find_protocol(filename, true);
4474
    if (!proto_drv) {
4475
        error_setg(errp, "Unknown protocol '%s'", filename);
4476
        return;
4477
    }
4478

    
4479
    create_options = append_option_parameters(create_options,
4480
                                              drv->create_options);
4481
    create_options = append_option_parameters(create_options,
4482
                                              proto_drv->create_options);
4483

    
4484
    /* Create parameter list with default values */
4485
    param = parse_option_parameters("", create_options, param);
4486

    
4487
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4488

    
4489
    /* Parse -o options */
4490
    if (options) {
4491
        param = parse_option_parameters(options, create_options, param);
4492
        if (param == NULL) {
4493
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
4494
            goto out;
4495
        }
4496
    }
4497

    
4498
    if (base_filename) {
4499
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4500
                                 base_filename)) {
4501
            error_setg(errp, "Backing file not supported for file format '%s'",
4502
                       fmt);
4503
            goto out;
4504
        }
4505
    }
4506

    
4507
    if (base_fmt) {
4508
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4509
            error_setg(errp, "Backing file format not supported for file "
4510
                             "format '%s'", fmt);
4511
            goto out;
4512
        }
4513
    }
4514

    
4515
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4516
    if (backing_file && backing_file->value.s) {
4517
        if (!strcmp(filename, backing_file->value.s)) {
4518
            error_setg(errp, "Error: Trying to create an image with the "
4519
                             "same filename as the backing file");
4520
            goto out;
4521
        }
4522
    }
4523

    
4524
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4525
    if (backing_fmt && backing_fmt->value.s) {
4526
        backing_drv = bdrv_find_format(backing_fmt->value.s);
4527
        if (!backing_drv) {
4528
            error_setg(errp, "Unknown backing file format '%s'",
4529
                       backing_fmt->value.s);
4530
            goto out;
4531
        }
4532
    }
4533

    
4534
    // The size for the image must always be specified, with one exception:
4535
    // If we are using a backing file, we can obtain the size from there
4536
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
4537
    if (size && size->value.n == -1) {
4538
        if (backing_file && backing_file->value.s) {
4539
            uint64_t size;
4540
            char buf[32];
4541
            int back_flags;
4542

    
4543
            /* backing files always opened read-only */
4544
            back_flags =
4545
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4546

    
4547
            bs = bdrv_new("");
4548

    
4549
            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
4550
                            backing_drv, NULL);
4551
            if (ret < 0) {
4552
                error_setg_errno(errp, -ret, "Could not open '%s'",
4553
                                 backing_file->value.s);
4554
                goto out;
4555
            }
4556
            bdrv_get_geometry(bs, &size);
4557
            size *= 512;
4558

    
4559
            snprintf(buf, sizeof(buf), "%" PRId64, size);
4560
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4561
        } else {
4562
            error_setg(errp, "Image creation needs a size parameter");
4563
            goto out;
4564
        }
4565
    }
4566

    
4567
    if (!quiet) {
4568
        printf("Formatting '%s', fmt=%s ", filename, fmt);
4569
        print_option_parameters(param);
4570
        puts("");
4571
    }
4572
    ret = bdrv_create(drv, filename, param);
4573
    if (ret < 0) {
4574
        if (ret == -ENOTSUP) {
4575
            error_setg(errp,"Formatting or formatting option not supported for "
4576
                            "file format '%s'", fmt);
4577
        } else if (ret == -EFBIG) {
4578
            const char *cluster_size_hint = "";
4579
            if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
4580
                cluster_size_hint = " (try using a larger cluster size)";
4581
            }
4582
            error_setg(errp, "The image size is too large for file format '%s'%s",
4583
                       fmt, cluster_size_hint);
4584
        } else {
4585
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
4586
                       strerror(-ret));
4587
        }
4588
    }
4589

    
4590
out:
4591
    free_option_parameters(create_options);
4592
    free_option_parameters(param);
4593

    
4594
    if (bs) {
4595
        bdrv_unref(bs);
4596
    }
4597
}
4598

    
4599
/*
 * Return the AioContext of @bs.
 *
 * Currently BlockDriverState always uses the main loop AioContext, so @bs
 * is not consulted.
 */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    AioContext *main_ctx = qemu_get_aio_context();

    return main_ctx;
}
4604

    
4605
void bdrv_add_before_write_notifier(BlockDriverState *bs,
4606
                                    NotifierWithReturn *notifier)
4607
{
4608
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
4609
}
4610

    
4611
/*
 * Ask the driver of @bs to amend the image according to @options.
 *
 * Returns -ENOMEDIUM if no driver is attached (same convention as
 * bdrv_co_discard()), -ENOTSUP if the driver has no bdrv_amend_options
 * callback, and the callback's result otherwise.
 */
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    BlockDriver *drv = bs->drv;

    /* Avoid dereferencing a NULL driver when nothing is opened */
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return drv->bdrv_amend_options(bs, options);
}