Statistics
| Branch: | Revision:

root / block.c @ 92bc50a5

History | View | Annotate | Download (127.9 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "qmp-commands.h"
36
#include "qemu/timer.h"
37

    
38
#ifdef CONFIG_BSD
39
#include <sys/types.h>
40
#include <sys/stat.h>
41
#include <sys/ioctl.h>
42
#include <sys/queue.h>
43
#ifndef __DragonFly__
44
#include <sys/disk.h>
45
#endif
46
#endif
47

    
48
#ifdef _WIN32
49
#include <windows.h>
50
#endif
51

    
52
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
53

    
54
/* Flag bits accepted by the internal bdrv_co_do_readv()/bdrv_co_do_writev(). */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
58

    
59
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
60
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62
        BlockDriverCompletionFunc *cb, void *opaque);
63
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65
        BlockDriverCompletionFunc *cb, void *opaque);
66
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67
                                         int64_t sector_num, int nb_sectors,
68
                                         QEMUIOVector *iov);
69
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70
                                         int64_t sector_num, int nb_sectors,
71
                                         QEMUIOVector *iov);
72
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
73
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74
    BdrvRequestFlags flags);
75
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
76
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77
    BdrvRequestFlags flags);
78
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79
                                               int64_t sector_num,
80
                                               QEMUIOVector *qiov,
81
                                               int nb_sectors,
82
                                               BlockDriverCompletionFunc *cb,
83
                                               void *opaque,
84
                                               bool is_write);
85
static void coroutine_fn bdrv_co_do_rw(void *opaque);
86
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
87
    int64_t sector_num, int nb_sectors);
88

    
89
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
90
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
91

    
92
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
93
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
94

    
95
/* If non-zero, use only whitelisted block drivers */
96
static int use_bdrv_whitelist;
97

    
98
#ifdef _WIN32
99
/*
 * Return 1 if filename begins with an ASCII letter immediately followed
 * by ':' (for example "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected when the first character is a letter */
    return is_letter && filename[1] == ':';
}
105

    
106
/*
 * Return 1 if filename is exactly a drive specification ("d:") or starts
 * with one of the prefixes "\\.\" or "//./", 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    return (strstart(filename, "\\\\.\\", NULL) ||
            strstart(filename, "//./", NULL)) ? 1 : 0;
}
116
#endif
117

    
118
/* throttling disk I/O limits */
119
void bdrv_set_io_limits(BlockDriverState *bs,
120
                        ThrottleConfig *cfg)
121
{
122
    int i;
123

    
124
    throttle_config(&bs->throttle_state, cfg);
125

    
126
    for (i = 0; i < 2; i++) {
127
        qemu_co_enter_next(&bs->throttled_reqs[i]);
128
    }
129
}
130

    
131
/* this function drain all the throttled IOs */
132
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
133
{
134
    bool drained = false;
135
    bool enabled = bs->io_limits_enabled;
136
    int i;
137

    
138
    bs->io_limits_enabled = false;
139

    
140
    for (i = 0; i < 2; i++) {
141
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
142
            drained = true;
143
        }
144
    }
145

    
146
    bs->io_limits_enabled = enabled;
147

    
148
    return drained;
149
}
150

    
151
/*
 * Switch I/O limits off for bs: clear the enabled flag, drain the
 * throttled request queues and destroy the throttle state.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
    (void)bdrv_start_throttled_reqs(bs);    /* result not needed here */
    throttle_destroy(&bs->throttle_state);
}
159

    
160
static void bdrv_throttle_read_timer_cb(void *opaque)
161
{
162
    BlockDriverState *bs = opaque;
163
    qemu_co_enter_next(&bs->throttled_reqs[0]);
164
}
165

    
166
static void bdrv_throttle_write_timer_cb(void *opaque)
167
{
168
    BlockDriverState *bs = opaque;
169
    qemu_co_enter_next(&bs->throttled_reqs[1]);
170
}
171

    
172
/* should be called before bdrv_set_io_limits if a limit is set */
173
void bdrv_io_limits_enable(BlockDriverState *bs)
174
{
175
    assert(!bs->io_limits_enabled);
176
    throttle_init(&bs->throttle_state,
177
                  QEMU_CLOCK_VIRTUAL,
178
                  bdrv_throttle_read_timer_cb,
179
                  bdrv_throttle_write_timer_cb,
180
                  bs);
181
    bs->io_limits_enabled = true;
182
}
183

    
184
/* This function makes an IO wait if needed
185
 *
186
 * @nb_sectors: the number of sectors of the IO
187
 * @is_write:   is the IO a write
188
 */
189
static void bdrv_io_limits_intercept(BlockDriverState *bs,
190
                                     int nb_sectors,
191
                                     bool is_write)
192
{
193
    /* does this io must wait */
194
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
195

    
196
    /* if must wait or any request of this type throttled queue the IO */
197
    if (must_wait ||
198
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
199
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
200
    }
201

    
202
    /* the IO will be executed, do the accounting */
203
    throttle_account(&bs->throttle_state,
204
                     is_write,
205
                     nb_sectors * BDRV_SECTOR_SIZE);
206

    
207
    /* if the next request must wait -> do nothing */
208
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
209
        return;
210
    }
211

    
212
    /* else queue next request for execution */
213
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
214
}
215

    
216
/*
 * Check if the path starts with "<protocol>:".
 *
 * Returns 1 when the first character out of ':' and the path separators
 * found in path is a ':', 0 otherwise. On Windows, drive specifications
 * are never treated as protocols.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* index of the first stop character, or of the terminating NUL */
    return path[strcspn(path, stop_chars)] == ':';
}
233

    
234
/*
 * Return non-zero if path is absolute, i.e. it starts with '/'.
 * On Windows a leading '\\' or a drive specification also counts.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
246

    
247
/*
 * If filename is absolute, just copy it to dest. Otherwise, build a
 * path to it by considering it is relative to base_path. URL are
 * supported.
 *
 * The prefix taken from base_path ends just after its last directory
 * separator or just after its first ':', whichever comes later; filename
 * is appended to that prefix. The result is truncated to fit dest_size
 * bytes including the terminating NUL. Nothing is written when
 * dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' of base_path, if there is one */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last directory separator, if there is one */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;

    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
290

    
291
/*
 * Write the backing file name of bs into dest (at most sz bytes).
 *
 * An empty backing file name or one carrying a protocol prefix is copied
 * unchanged; anything else is combined with bs->filename through
 * path_combine().
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;

    if (backing[0] != '\0' && !path_has_protocol(backing)) {
        path_combine(dest, sz, bs->filename, backing);
        return;
    }

    pstrcpy(dest, sz, backing);
}
299

    
300
/*
 * Register a block driver by inserting it at the head of bdrv_drivers.
 *
 * Drivers that do not provide bdrv_co_readv get the emulated coroutine
 * read/write functions; if they additionally lack bdrv_aio_readv, the
 * emulated AIO read/write functions are installed as well.
 */
void bdrv_register(BlockDriver *bdrv)
{
    const bool has_coroutine_io = (bdrv->bdrv_co_readv != NULL);

    if (!has_coroutine_io) {
        /* Block drivers without coroutine functions need emulation */
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so
         * if the block driver lacks aio we need to emulate that too.
         */
        if (bdrv->bdrv_aio_readv == NULL) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
319

    
320
/* create a new block device (by default it is empty) */
321
BlockDriverState *bdrv_new(const char *device_name)
322
{
323
    BlockDriverState *bs;
324

    
325
    bs = g_malloc0(sizeof(BlockDriverState));
326
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
327
    if (device_name[0] != '\0') {
328
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
329
    }
330
    bdrv_iostatus_disable(bs);
331
    notifier_list_init(&bs->close_notifiers);
332
    notifier_with_return_list_init(&bs->before_write_notifiers);
333
    qemu_co_queue_init(&bs->throttled_reqs[0]);
334
    qemu_co_queue_init(&bs->throttled_reqs[1]);
335
    bs->refcnt = 1;
336

    
337
    return bs;
338
}
339

    
340
/* Add notify to the list of close notifiers of bs. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    NotifierList *notifiers = &bs->close_notifiers;

    notifier_list_add(notifiers, notify);
}
344

    
345
BlockDriver *bdrv_find_format(const char *format_name)
346
{
347
    BlockDriver *drv1;
348
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
349
        if (!strcmp(drv1->format_name, format_name)) {
350
            return drv1;
351
        }
352
    }
353
    return NULL;
354
}
355

    
356
/*
 * Check drv against the build-time driver whitelists.
 *
 * Returns 1 if both whitelists are empty, if the driver's format name is
 * in the read-write whitelist, or if read_only is set and the name is in
 * the read-only whitelist. Returns 0 otherwise.
 */
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **entry;

    if (whitelist_rw[0] == NULL && whitelist_ro[0] == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    for (entry = whitelist_rw; *entry != NULL; entry++) {
        if (strcmp(drv->format_name, *entry) == 0) {
            return 1;
        }
    }

    if (!read_only) {
        return 0;
    }

    for (entry = whitelist_ro; *entry != NULL; entry++) {
        if (strcmp(drv->format_name, *entry) == 0) {
            return 1;
        }
    }

    return 0;
}
384

    
385
/*
 * Like bdrv_find_format(), but returns NULL as well when the driver found
 * does not pass bdrv_is_whitelisted() for the given read_only mode.
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (drv == NULL || !bdrv_is_whitelisted(drv, read_only)) {
        return NULL;
    }

    return drv;
}
391

    
392
typedef struct CreateCo {
393
    BlockDriver *drv;
394
    char *filename;
395
    QEMUOptionParameter *options;
396
    int ret;
397
    Error *err;
398
} CreateCo;
399

    
400
/*
 * Coroutine entry point for bdrv_create(); opaque is a CreateCo.
 *
 * Calls the driver's bdrv_create callback, hands any error over to
 * cco->err and finally stores the return value in cco->ret.
 */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    Error *create_err = NULL;
    int result;

    assert(cco->drv);

    result = cco->drv->bdrv_create(cco->filename, cco->options, &create_err);
    if (error_is_set(&create_err)) {
        error_propagate(&cco->err, create_err);
    }

    /* written last: bdrv_create() waits for ret to leave NOT_DONE */
    cco->ret = result;
}
414

    
415
int bdrv_create(BlockDriver *drv, const char* filename,
416
    QEMUOptionParameter *options, Error **errp)
417
{
418
    int ret;
419

    
420
    Coroutine *co;
421
    CreateCo cco = {
422
        .drv = drv,
423
        .filename = g_strdup(filename),
424
        .options = options,
425
        .ret = NOT_DONE,
426
        .err = NULL,
427
    };
428

    
429
    if (!drv->bdrv_create) {
430
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
431
        ret = -ENOTSUP;
432
        goto out;
433
    }
434

    
435
    if (qemu_in_coroutine()) {
436
        /* Fast-path if already in coroutine context */
437
        bdrv_create_co_entry(&cco);
438
    } else {
439
        co = qemu_coroutine_create(bdrv_create_co_entry);
440
        qemu_coroutine_enter(co, &cco);
441
        while (cco.ret == NOT_DONE) {
442
            qemu_aio_wait();
443
        }
444
    }
445

    
446
    ret = cco.ret;
447
    if (ret < 0) {
448
        if (error_is_set(&cco.err)) {
449
            error_propagate(errp, cco.err);
450
        } else {
451
            error_setg_errno(errp, -ret, "Could not create image");
452
        }
453
    }
454

    
455
out:
456
    g_free(cco.filename);
457
    return ret;
458
}
459

    
460
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
461
                     Error **errp)
462
{
463
    BlockDriver *drv;
464
    Error *local_err = NULL;
465
    int ret;
466

    
467
    drv = bdrv_find_protocol(filename, true);
468
    if (drv == NULL) {
469
        error_setg(errp, "Could not find protocol for file '%s'", filename);
470
        return -ENOENT;
471
    }
472

    
473
    ret = bdrv_create(drv, filename, options, &local_err);
474
    if (error_is_set(&local_err)) {
475
        error_propagate(errp, local_err);
476
    }
477
    return ret;
478
}
479

    
480
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer receiving the name of the created file
 * @size:     size of @filename in bytes
 *
 * On POSIX hosts the file is created as "$TMPDIR/vl.XXXXXX" ("/tmp" is
 * used when TMPDIR is not set).
 *
 * Return 0 upon success, otherwise a negative errno value
 * (-EOVERFLOW when the name does not fit into @filename).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int len;
    int saved_errno;
    const char *tmpdir;

    /* A non-positive size would be converted to a huge size_t below. */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }

    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }

    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so remember close()'s error first */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }

    return 0;
#endif
}
514

    
515
/*
516
 * Detect host devices. By convention, /dev/cdrom[N] is always
517
 * recognized as a host CDROM.
518
 */
519
static BlockDriver *find_hdev_driver(const char *filename)
520
{
521
    int score_max = 0, score;
522
    BlockDriver *drv = NULL, *d;
523

    
524
    QLIST_FOREACH(d, &bdrv_drivers, list) {
525
        if (d->bdrv_probe_device) {
526
            score = d->bdrv_probe_device(filename);
527
            if (score > score_max) {
528
                score_max = score;
529
                drv = d;
530
            }
531
        }
532
    }
533

    
534
    return drv;
535
}
536

    
537
BlockDriver *bdrv_find_protocol(const char *filename,
538
                                bool allow_protocol_prefix)
539
{
540
    BlockDriver *drv1;
541
    char protocol[128];
542
    int len;
543
    const char *p;
544

    
545
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
546

    
547
    /*
548
     * XXX(hch): we really should not let host device detection
549
     * override an explicit protocol specification, but moving this
550
     * later breaks access to device names with colons in them.
551
     * Thanks to the brain-dead persistent naming schemes on udev-
552
     * based Linux systems those actually are quite common.
553
     */
554
    drv1 = find_hdev_driver(filename);
555
    if (drv1) {
556
        return drv1;
557
    }
558

    
559
    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
560
        return bdrv_find_format("file");
561
    }
562

    
563
    p = strchr(filename, ':');
564
    assert(p != NULL);
565
    len = p - filename;
566
    if (len > sizeof(protocol) - 1)
567
        len = sizeof(protocol) - 1;
568
    memcpy(protocol, filename, len);
569
    protocol[len] = '\0';
570
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
571
        if (drv1->protocol_name &&
572
            !strcmp(drv1->protocol_name, protocol)) {
573
            return drv1;
574
        }
575
    }
576
    return NULL;
577
}
578

    
579
static int find_image_format(BlockDriverState *bs, const char *filename,
580
                             BlockDriver **pdrv, Error **errp)
581
{
582
    int score, score_max;
583
    BlockDriver *drv1, *drv;
584
    uint8_t buf[2048];
585
    int ret = 0;
586

    
587
    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
588
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
589
        drv = bdrv_find_format("raw");
590
        if (!drv) {
591
            error_setg(errp, "Could not find raw image format");
592
            ret = -ENOENT;
593
        }
594
        *pdrv = drv;
595
        return ret;
596
    }
597

    
598
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
599
    if (ret < 0) {
600
        error_setg_errno(errp, -ret, "Could not read image for determining its "
601
                         "format");
602
        *pdrv = NULL;
603
        return ret;
604
    }
605

    
606
    score_max = 0;
607
    drv = NULL;
608
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
609
        if (drv1->bdrv_probe) {
610
            score = drv1->bdrv_probe(buf, ret, filename);
611
            if (score > score_max) {
612
                score_max = score;
613
                drv = drv1;
614
            }
615
        }
616
    }
617
    if (!drv) {
618
        error_setg(errp, "Could not determine image format: No compatible "
619
                   "driver found");
620
        ret = -ENOENT;
621
    }
622
    *pdrv = drv;
623
    return ret;
624
}
625

    
626
/**
627
 * Set the current 'total_sectors' value
628
 */
629
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
630
{
631
    BlockDriver *drv = bs->drv;
632

    
633
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
634
    if (bs->sg)
635
        return 0;
636

    
637
    /* query actual device if possible, otherwise just trust the hint */
638
    if (drv->bdrv_getlength) {
639
        int64_t length = drv->bdrv_getlength(bs);
640
        if (length < 0) {
641
            return length;
642
        }
643
        hint = length >> BDRV_SECTOR_BITS;
644
    }
645

    
646
    bs->total_sectors = hint;
647
    return 0;
648
}
649

    
650
/**
651
 * Set open flags for a given discard mode
652
 *
653
 * Return 0 on success, -1 if the discard mode was invalid.
654
 */
655
int bdrv_parse_discard_flags(const char *mode, int *flags)
656
{
657
    *flags &= ~BDRV_O_UNMAP;
658

    
659
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
660
        /* do nothing */
661
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
662
        *flags |= BDRV_O_UNMAP;
663
    } else {
664
        return -1;
665
    }
666

    
667
    return 0;
668
}
669

    
670
/**
671
 * Set open flags for a given cache mode
672
 *
673
 * Return 0 on success, -1 if the cache mode was invalid.
674
 */
675
int bdrv_parse_cache_flags(const char *mode, int *flags)
676
{
677
    *flags &= ~BDRV_O_CACHE_MASK;
678

    
679
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
680
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
681
    } else if (!strcmp(mode, "directsync")) {
682
        *flags |= BDRV_O_NOCACHE;
683
    } else if (!strcmp(mode, "writeback")) {
684
        *flags |= BDRV_O_CACHE_WB;
685
    } else if (!strcmp(mode, "unsafe")) {
686
        *flags |= BDRV_O_CACHE_WB;
687
        *flags |= BDRV_O_NO_FLUSH;
688
    } else if (!strcmp(mode, "writethrough")) {
689
        /* this is the default */
690
    } else {
691
        return -1;
692
    }
693

    
694
    return 0;
695
}
696

    
697
/**
698
 * The copy-on-read flag is actually a reference count so multiple users may
699
 * use the feature without worrying about clobbering its previous state.
700
 * Copy-on-read stays enabled until all users have called to disable it.
701
 */
702
void bdrv_enable_copy_on_read(BlockDriverState *bs)
703
{
704
    bs->copy_on_read++;
705
}
706

    
707
/*
 * Drop one copy-on-read reference taken by bdrv_enable_copy_on_read().
 * The counter must be positive (asserted).
 */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
712

    
713
/*
 * Compute the flags passed to a driver's open function from the flags
 * given by the caller: BDRV_O_CACHE_WB is always set, the flags that are
 * internal to the block layer (BDRV_O_SNAPSHOT, BDRV_O_NO_BACKING) are
 * cleared, and temporary images are forced writable.
 */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    const int internal_only = BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING;
    int result = (flags | BDRV_O_CACHE_WB) & ~internal_only;

    /* Snapshots should be writable. */
    if (bs->is_temporary) {
        result |= BDRV_O_RDWR;
    }

    return result;
}
732

    
733
/*
734
 * Common part for opening disk images and files
735
 *
736
 * Removes all processed options from *options.
737
 */
738
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
739
    QDict *options, int flags, BlockDriver *drv, Error **errp)
740
{
741
    int ret, open_flags;
742
    const char *filename;
743
    Error *local_err = NULL;
744

    
745
    assert(drv != NULL);
746
    assert(bs->file == NULL);
747
    assert(options != NULL && bs->options != options);
748

    
749
    if (file != NULL) {
750
        filename = file->filename;
751
    } else {
752
        filename = qdict_get_try_str(options, "filename");
753
    }
754

    
755
    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
756

    
757
    /* bdrv_open() with directly using a protocol as drv. This layer is already
758
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
759
     * and return immediately. */
760
    if (file != NULL && drv->bdrv_file_open) {
761
        bdrv_swap(file, bs);
762
        return 0;
763
    }
764

    
765
    bs->open_flags = flags;
766
    bs->buffer_alignment = 512;
767
    bs->zero_beyond_eof = true;
768
    open_flags = bdrv_open_flags(bs, flags);
769
    bs->read_only = !(open_flags & BDRV_O_RDWR);
770

    
771
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
772
        error_setg(errp, "Driver '%s' is not whitelisted", drv->format_name);
773
        return -ENOTSUP;
774
    }
775

    
776
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
777
    if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
778
        bdrv_enable_copy_on_read(bs);
779
    }
780

    
781
    if (filename != NULL) {
782
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
783
    } else {
784
        bs->filename[0] = '\0';
785
    }
786

    
787
    bs->drv = drv;
788
    bs->opaque = g_malloc0(drv->instance_size);
789

    
790
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
791

    
792
    /* Open the image, either directly or using a protocol */
793
    if (drv->bdrv_file_open) {
794
        assert(file == NULL);
795
        assert(!drv->bdrv_needs_filename || filename != NULL);
796
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
797
    } else {
798
        if (file == NULL) {
799
            error_setg(errp, "Can't use '%s' as a block driver for the "
800
                       "protocol level", drv->format_name);
801
            ret = -EINVAL;
802
            goto free_and_fail;
803
        }
804
        bs->file = file;
805
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
806
    }
807

    
808
    if (ret < 0) {
809
        if (error_is_set(&local_err)) {
810
            error_propagate(errp, local_err);
811
        } else if (bs->filename[0]) {
812
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
813
        } else {
814
            error_setg_errno(errp, -ret, "Could not open image");
815
        }
816
        goto free_and_fail;
817
    }
818

    
819
    ret = refresh_total_sectors(bs, bs->total_sectors);
820
    if (ret < 0) {
821
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
822
        goto free_and_fail;
823
    }
824

    
825
#ifndef _WIN32
826
    if (bs->is_temporary) {
827
        assert(bs->filename[0] != '\0');
828
        unlink(bs->filename);
829
    }
830
#endif
831
    return 0;
832

    
833
free_and_fail:
834
    bs->file = NULL;
835
    g_free(bs->opaque);
836
    bs->opaque = NULL;
837
    bs->drv = NULL;
838
    return ret;
839
}
840

    
841
/*
842
 * Opens a file using a protocol (file, host_device, nbd, ...)
843
 *
844
 * options is a QDict of options to pass to the block drivers, or NULL for an
845
 * empty set of options. The reference to the QDict belongs to the block layer
846
 * after the call (even on failure), so if the caller intends to reuse the
847
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
848
 */
849
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
850
                   QDict *options, int flags, Error **errp)
851
{
852
    BlockDriverState *bs;
853
    BlockDriver *drv;
854
    const char *drvname;
855
    bool allow_protocol_prefix = false;
856
    Error *local_err = NULL;
857
    int ret;
858

    
859
    /* NULL means an empty set of options */
860
    if (options == NULL) {
861
        options = qdict_new();
862
    }
863

    
864
    bs = bdrv_new("");
865
    bs->options = options;
866
    options = qdict_clone_shallow(options);
867

    
868
    /* Fetch the file name from the options QDict if necessary */
869
    if (!filename) {
870
        filename = qdict_get_try_str(options, "filename");
871
    } else if (filename && !qdict_haskey(options, "filename")) {
872
        qdict_put(options, "filename", qstring_from_str(filename));
873
        allow_protocol_prefix = true;
874
    } else {
875
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
876
                   "same time");
877
        ret = -EINVAL;
878
        goto fail;
879
    }
880

    
881
    /* Find the right block driver */
882
    drvname = qdict_get_try_str(options, "driver");
883
    if (drvname) {
884
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
885
        if (!drv) {
886
            error_setg(errp, "Unknown driver '%s'", drvname);
887
        }
888
        qdict_del(options, "driver");
889
    } else if (filename) {
890
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
891
        if (!drv) {
892
            error_setg(errp, "Unknown protocol");
893
        }
894
    } else {
895
        error_setg(errp, "Must specify either driver or file");
896
        drv = NULL;
897
    }
898

    
899
    if (!drv) {
900
        /* errp has been set already */
901
        ret = -ENOENT;
902
        goto fail;
903
    }
904

    
905
    /* Parse the filename and open it */
906
    if (drv->bdrv_parse_filename && filename) {
907
        drv->bdrv_parse_filename(filename, options, &local_err);
908
        if (error_is_set(&local_err)) {
909
            error_propagate(errp, local_err);
910
            ret = -EINVAL;
911
            goto fail;
912
        }
913
        qdict_del(options, "filename");
914
    } else if (drv->bdrv_needs_filename && !filename) {
915
        error_setg(errp, "The '%s' block driver requires a file name",
916
                   drv->format_name);
917
        ret = -EINVAL;
918
        goto fail;
919
    }
920

    
921
    ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
922
    if (ret < 0) {
923
        error_propagate(errp, local_err);
924
        goto fail;
925
    }
926

    
927
    /* Check if any unknown options were used */
928
    if (qdict_size(options) != 0) {
929
        const QDictEntry *entry = qdict_first(options);
930
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
931
                   drv->format_name, entry->key);
932
        ret = -EINVAL;
933
        goto fail;
934
    }
935
    QDECREF(options);
936

    
937
    bs->growable = 1;
938
    *pbs = bs;
939
    return 0;
940

    
941
fail:
942
    QDECREF(options);
943
    if (!bs->drv) {
944
        QDECREF(bs->options);
945
    }
946
    bdrv_unref(bs);
947
    return ret;
948
}
949

    
950
/*
951
 * Opens the backing file for a BlockDriverState if not yet open
952
 *
953
 * options is a QDict of options to pass to the block drivers, or NULL for an
954
 * empty set of options. The reference to the QDict is transferred to this
955
 * function (even on failure), so if the caller intends to reuse the dictionary,
956
 * it needs to use QINCREF() before calling bdrv_file_open.
957
 */
958
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
959
{
960
    char backing_filename[PATH_MAX];
961
    int back_flags, ret;
962
    BlockDriver *back_drv = NULL;
963
    Error *local_err = NULL;
964

    
965
    if (bs->backing_hd != NULL) {
966
        QDECREF(options);
967
        return 0;
968
    }
969

    
970
    /* NULL means an empty set of options */
971
    if (options == NULL) {
972
        options = qdict_new();
973
    }
974

    
975
    bs->open_flags &= ~BDRV_O_NO_BACKING;
976
    if (qdict_haskey(options, "file.filename")) {
977
        backing_filename[0] = '\0';
978
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
979
        QDECREF(options);
980
        return 0;
981
    } else {
982
        bdrv_get_full_backing_filename(bs, backing_filename,
983
                                       sizeof(backing_filename));
984
    }
985

    
986
    bs->backing_hd = bdrv_new("");
987

    
988
    if (bs->backing_format[0] != '\0') {
989
        back_drv = bdrv_find_format(bs->backing_format);
990
    }
991

    
992
    /* backing files always opened read-only */
993
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);
994

    
995
    ret = bdrv_open(bs->backing_hd,
996
                    *backing_filename ? backing_filename : NULL, options,
997
                    back_flags, back_drv, &local_err);
998
    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
999
            bs->backing_hd->file->filename);
1000
    if (ret < 0) {
1001
        bdrv_unref(bs->backing_hd);
1002
        bs->backing_hd = NULL;
1003
        bs->open_flags |= BDRV_O_NO_BACKING;
1004
        error_propagate(errp, local_err);
1005
        return ret;
1006
    }
1007
    return 0;
1008
}
1009

    
1010
/*
1011
 * Opens a disk image (raw, qcow2, vmdk, ...)
1012
 *
1013
 * options is a QDict of options to pass to the block drivers, or NULL for an
1014
 * empty set of options. The reference to the QDict belongs to the block layer
1015
 * after the call (even on failure), so if the caller intends to reuse the
1016
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1017
 */
1018
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
1019
              int flags, BlockDriver *drv, Error **errp)
1020
{
1021
    int ret;
1022
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1023
    char tmp_filename[PATH_MAX + 1];
1024
    BlockDriverState *file = NULL;
1025
    QDict *file_options = NULL;
1026
    const char *drvname;
1027
    Error *local_err = NULL;
1028

    
1029
    /* NULL means an empty set of options */
1030
    if (options == NULL) {
1031
        options = qdict_new();
1032
    }
1033

    
1034
    bs->options = options;
1035
    options = qdict_clone_shallow(options);
1036

    
1037
    /* For snapshot=on, create a temporary qcow2 overlay */
1038
    if (flags & BDRV_O_SNAPSHOT) {
1039
        BlockDriverState *bs1;
1040
        int64_t total_size;
1041
        BlockDriver *bdrv_qcow2;
1042
        QEMUOptionParameter *create_options;
1043
        char backing_filename[PATH_MAX];
1044

    
1045
        if (qdict_size(options) != 0) {
1046
            error_setg(errp, "Can't use snapshot=on with driver-specific options");
1047
            ret = -EINVAL;
1048
            goto fail;
1049
        }
1050
        assert(filename != NULL);
1051

    
1052
        /* if snapshot, we create a temporary backing file and open it
1053
           instead of opening 'filename' directly */
1054

    
1055
        /* if there is a backing file, use it */
1056
        bs1 = bdrv_new("");
1057
        ret = bdrv_open(bs1, filename, NULL, 0, drv, &local_err);
1058
        if (ret < 0) {
1059
            bdrv_unref(bs1);
1060
            goto fail;
1061
        }
1062
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1063

    
1064
        bdrv_unref(bs1);
1065

    
1066
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1067
        if (ret < 0) {
1068
            error_setg_errno(errp, -ret, "Could not get temporary filename");
1069
            goto fail;
1070
        }
1071

    
1072
        /* Real path is meaningless for protocols */
1073
        if (path_has_protocol(filename)) {
1074
            snprintf(backing_filename, sizeof(backing_filename),
1075
                     "%s", filename);
1076
        } else if (!realpath(filename, backing_filename)) {
1077
            error_setg_errno(errp, errno, "Could not resolve path '%s'", filename);
1078
            ret = -errno;
1079
            goto fail;
1080
        }
1081

    
1082
        bdrv_qcow2 = bdrv_find_format("qcow2");
1083
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1084
                                                 NULL);
1085

    
1086
        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1087
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
1088
                             backing_filename);
1089
        if (drv) {
1090
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
1091
                drv->format_name);
1092
        }
1093

    
1094
        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1095
        free_option_parameters(create_options);
1096
        if (ret < 0) {
1097
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
1098
                             "'%s': %s", tmp_filename,
1099
                             error_get_pretty(local_err));
1100
            error_free(local_err);
1101
            local_err = NULL;
1102
            goto fail;
1103
        }
1104

    
1105
        filename = tmp_filename;
1106
        drv = bdrv_qcow2;
1107
        bs->is_temporary = 1;
1108
    }
1109

    
1110
    /* Open image file without format layer */
1111
    if (flags & BDRV_O_RDWR) {
1112
        flags |= BDRV_O_ALLOW_RDWR;
1113
    }
1114

    
1115
    qdict_extract_subqdict(options, &file_options, "file.");
1116

    
1117
    ret = bdrv_file_open(&file, filename, file_options,
1118
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
1119
    if (ret < 0) {
1120
        goto fail;
1121
    }
1122

    
1123
    /* Find the right image format driver */
1124
    drvname = qdict_get_try_str(options, "driver");
1125
    if (drvname) {
1126
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
1127
        qdict_del(options, "driver");
1128
    }
1129

    
1130
    if (!drv) {
1131
        ret = find_image_format(file, filename, &drv, &local_err);
1132
    }
1133

    
1134
    if (!drv) {
1135
        goto unlink_and_fail;
1136
    }
1137

    
1138
    /* Open the image */
1139
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1140
    if (ret < 0) {
1141
        goto unlink_and_fail;
1142
    }
1143

    
1144
    if (bs->file != file) {
1145
        bdrv_unref(file);
1146
        file = NULL;
1147
    }
1148

    
1149
    /* If there is a backing file, use it */
1150
    if ((flags & BDRV_O_NO_BACKING) == 0) {
1151
        QDict *backing_options;
1152

    
1153
        qdict_extract_subqdict(options, &backing_options, "backing.");
1154
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1155
        if (ret < 0) {
1156
            goto close_and_fail;
1157
        }
1158
    }
1159

    
1160
    /* Check if any unknown options were used */
1161
    if (qdict_size(options) != 0) {
1162
        const QDictEntry *entry = qdict_first(options);
1163
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1164
                   "support the option '%s'", drv->format_name, bs->device_name,
1165
                   entry->key);
1166

    
1167
        ret = -EINVAL;
1168
        goto close_and_fail;
1169
    }
1170
    QDECREF(options);
1171

    
1172
    if (!bdrv_key_required(bs)) {
1173
        bdrv_dev_change_media_cb(bs, true);
1174
    }
1175

    
1176
    return 0;
1177

    
1178
unlink_and_fail:
1179
    if (file != NULL) {
1180
        bdrv_unref(file);
1181
    }
1182
    if (bs->is_temporary) {
1183
        unlink(filename);
1184
    }
1185
fail:
1186
    QDECREF(bs->options);
1187
    QDECREF(options);
1188
    bs->options = NULL;
1189
    if (error_is_set(&local_err)) {
1190
        error_propagate(errp, local_err);
1191
    }
1192
    return ret;
1193

    
1194
close_and_fail:
1195
    bdrv_close(bs);
1196
    QDECREF(options);
1197
    if (error_is_set(&local_err)) {
1198
        error_propagate(errp, local_err);
1199
    }
1200
    return ret;
1201
}
1202

    
1203
typedef struct BlockReopenQueueEntry {
1204
     bool prepared;
1205
     BDRVReopenState state;
1206
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1207
} BlockReopenQueueEntry;
1208

    
1209
/*
1210
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1211
 * reopen of multiple devices.
1212
 *
1213
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1214
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1215
 * be created and initialized. This newly created BlockReopenQueue should be
1216
 * passed back in for subsequent calls that are intended to be of the same
1217
 * atomic 'set'.
1218
 *
1219
 * bs is the BlockDriverState to add to the reopen queue.
1220
 *
1221
 * flags contains the open flags for the associated bs
1222
 *
1223
 * returns a pointer to bs_queue, which is either the newly allocated
1224
 * bs_queue, or the existing bs_queue being used.
1225
 *
1226
 */
1227
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1228
                                    BlockDriverState *bs, int flags)
1229
{
1230
    assert(bs != NULL);
1231

    
1232
    BlockReopenQueueEntry *bs_entry;
1233
    if (bs_queue == NULL) {
1234
        bs_queue = g_new0(BlockReopenQueue, 1);
1235
        QSIMPLEQ_INIT(bs_queue);
1236
    }
1237

    
1238
    if (bs->file) {
1239
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1240
    }
1241

    
1242
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1243
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1244

    
1245
    bs_entry->state.bs = bs;
1246
    bs_entry->state.flags = flags;
1247

    
1248
    return bs_queue;
1249
}
1250

    
1251
/*
1252
 * Reopen multiple BlockDriverStates atomically & transactionally.
1253
 *
1254
 * The queue passed in (bs_queue) must have been built up previous
1255
 * via bdrv_reopen_queue().
1256
 *
1257
 * Reopens all BDS specified in the queue, with the appropriate
1258
 * flags.  All devices are prepared for reopen, and failure of any
1259
 * device will cause all device changes to be abandonded, and intermediate
1260
 * data cleaned up.
1261
 *
1262
 * If all devices prepare successfully, then the changes are committed
1263
 * to all devices.
1264
 *
1265
 */
1266
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1267
{
1268
    int ret = -1;
1269
    BlockReopenQueueEntry *bs_entry, *next;
1270
    Error *local_err = NULL;
1271

    
1272
    assert(bs_queue != NULL);
1273

    
1274
    bdrv_drain_all();
1275

    
1276
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1277
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1278
            error_propagate(errp, local_err);
1279
            goto cleanup;
1280
        }
1281
        bs_entry->prepared = true;
1282
    }
1283

    
1284
    /* If we reach this point, we have success and just need to apply the
1285
     * changes
1286
     */
1287
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1288
        bdrv_reopen_commit(&bs_entry->state);
1289
    }
1290

    
1291
    ret = 0;
1292

    
1293
cleanup:
1294
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1295
        if (ret && bs_entry->prepared) {
1296
            bdrv_reopen_abort(&bs_entry->state);
1297
        }
1298
        g_free(bs_entry);
1299
    }
1300
    g_free(bs_queue);
1301
    return ret;
1302
}
1303

    
1304

    
1305
/* Reopen a single BlockDriverState with the specified flags. */
1306
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1307
{
1308
    int ret = -1;
1309
    Error *local_err = NULL;
1310
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1311

    
1312
    ret = bdrv_reopen_multiple(queue, &local_err);
1313
    if (local_err != NULL) {
1314
        error_propagate(errp, local_err);
1315
    }
1316
    return ret;
1317
}
1318

    
1319

    
1320
/*
1321
 * Prepares a BlockDriverState for reopen. All changes are staged in the
1322
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1323
 * the block driver layer .bdrv_reopen_prepare()
1324
 *
1325
 * bs is the BlockDriverState to reopen
1326
 * flags are the new open flags
1327
 * queue is the reopen queue
1328
 *
1329
 * Returns 0 on success, non-zero on error.  On error errp will be set
1330
 * as well.
1331
 *
1332
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1333
 * It is the responsibility of the caller to then call the abort() or
1334
 * commit() for any other BDS that have been left in a prepare() state
1335
 *
1336
 */
1337
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1338
                        Error **errp)
1339
{
1340
    int ret = -1;
1341
    Error *local_err = NULL;
1342
    BlockDriver *drv;
1343

    
1344
    assert(reopen_state != NULL);
1345
    assert(reopen_state->bs->drv != NULL);
1346
    drv = reopen_state->bs->drv;
1347

    
1348
    /* if we are to stay read-only, do not allow permission change
1349
     * to r/w */
1350
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1351
        reopen_state->flags & BDRV_O_RDWR) {
1352
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1353
                  reopen_state->bs->device_name);
1354
        goto error;
1355
    }
1356

    
1357

    
1358
    ret = bdrv_flush(reopen_state->bs);
1359
    if (ret) {
1360
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1361
                  strerror(-ret));
1362
        goto error;
1363
    }
1364

    
1365
    if (drv->bdrv_reopen_prepare) {
1366
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1367
        if (ret) {
1368
            if (local_err != NULL) {
1369
                error_propagate(errp, local_err);
1370
            } else {
1371
                error_setg(errp, "failed while preparing to reopen image '%s'",
1372
                           reopen_state->bs->filename);
1373
            }
1374
            goto error;
1375
        }
1376
    } else {
1377
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1378
         * handler for each supported drv. */
1379
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1380
                  drv->format_name, reopen_state->bs->device_name,
1381
                 "reopening of file");
1382
        ret = -1;
1383
        goto error;
1384
    }
1385

    
1386
    ret = 0;
1387

    
1388
error:
1389
    return ret;
1390
}
1391

    
1392
/*
1393
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1394
 * makes them final by swapping the staging BlockDriverState contents into
1395
 * the active BlockDriverState contents.
1396
 */
1397
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1398
{
1399
    BlockDriver *drv;
1400

    
1401
    assert(reopen_state != NULL);
1402
    drv = reopen_state->bs->drv;
1403
    assert(drv != NULL);
1404

    
1405
    /* If there are any driver level actions to take */
1406
    if (drv->bdrv_reopen_commit) {
1407
        drv->bdrv_reopen_commit(reopen_state);
1408
    }
1409

    
1410
    /* set BDS specific flags now */
1411
    reopen_state->bs->open_flags         = reopen_state->flags;
1412
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1413
                                              BDRV_O_CACHE_WB);
1414
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1415
}
1416

    
1417
/*
1418
 * Abort the reopen, and delete and free the staged changes in
1419
 * reopen_state
1420
 */
1421
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1422
{
1423
    BlockDriver *drv;
1424

    
1425
    assert(reopen_state != NULL);
1426
    drv = reopen_state->bs->drv;
1427
    assert(drv != NULL);
1428

    
1429
    if (drv->bdrv_reopen_abort) {
1430
        drv->bdrv_reopen_abort(reopen_state);
1431
    }
1432
}
1433

    
1434

    
1435
/*
 * Close the image attached to bs.
 *
 * Cancels a running block job, drains and flushes I/O, fires the close
 * notifiers and, if a driver is attached, releases the backing file, the
 * driver state, the options and the underlying file and resets the
 * per-image fields. Finally reports the media change to the device and
 * disables I/O throttling if it was enabled. bs itself is not freed.
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job != NULL) {
        block_job_cancel_sync(bs->job);
    }

    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv != NULL) {
        if (bs->backing_hd != NULL) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }

        /* Let the driver shut down, then release its private state */
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;

        /* Forget everything that described the image just closed */
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->copy_on_read = 0;
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;

        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1484

    
1485
void bdrv_close_all(void)
1486
{
1487
    BlockDriverState *bs;
1488

    
1489
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1490
        bdrv_close(bs);
1491
    }
1492
}
1493

    
1494
/* Check if any requests are in-flight (including throttled requests) */
1495
static bool bdrv_requests_pending(BlockDriverState *bs)
1496
{
1497
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
1498
        return true;
1499
    }
1500
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1501
        return true;
1502
    }
1503
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1504
        return true;
1505
    }
1506
    if (bs->file && bdrv_requests_pending(bs->file)) {
1507
        return true;
1508
    }
1509
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1510
        return true;
1511
    }
1512
    return false;
1513
}
1514

    
1515
static bool bdrv_requests_pending_all(void)
1516
{
1517
    BlockDriverState *bs;
1518
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1519
        if (bdrv_requests_pending(bs)) {
1520
            return true;
1521
        }
1522
    }
1523
    return false;
1524
}
1525

    
1526
/*
1527
 * Wait for pending requests to complete across all BlockDriverStates
1528
 *
1529
 * This function does not flush data to disk, use bdrv_flush_all() for that
1530
 * after calling this function.
1531
 *
1532
 * Note that completion of an asynchronous I/O operation can trigger any
1533
 * number of other I/O operations on other devices---for example a coroutine
1534
 * can be arbitrarily complex and a constant flow of I/O can come until the
1535
 * coroutine is complete.  Because of this, it is not possible to have a
1536
 * function to drain a single device's I/O queue.
1537
 */
1538
void bdrv_drain_all(void)
1539
{
1540
    /* Always run first iteration so any pending completion BHs run */
1541
    bool busy = true;
1542
    BlockDriverState *bs;
1543

    
1544
    while (busy) {
1545
        /* FIXME: We do not have timer support here, so this is effectively
1546
         * a busy wait.
1547
         */
1548
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
1549
            if (bdrv_start_throttled_reqs(bs)) {
1550
                busy = true;
1551
            }
1552
        }
1553

    
1554
        busy = bdrv_requests_pending_all();
1555
        busy |= aio_poll(qemu_get_aio_context(), busy);
1556
    }
1557
}
1558

    
1559
/* make a BlockDriverState anonymous by removing from bdrv_state list.
1560
   Also, NULL terminate the device_name to prevent double remove */
1561
void bdrv_make_anon(BlockDriverState *bs)
1562
{
1563
    if (bs->device_name[0] != '\0') {
1564
        QTAILQ_REMOVE(&bdrv_states, bs, list);
1565
    }
1566
    bs->device_name[0] = '\0';
1567
}
1568

    
1569
/* Invoke the driver's optional .bdrv_rebind() handler for bs, if any. */
static void bdrv_rebind(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv == NULL || drv->bdrv_rebind == NULL) {
        return;
    }
    drv->bdrv_rebind(bs);
}
1575

    
1576
/*
 * Copy from bs_src to bs_dest the fields that need to stay attached to the
 * device rather than to the image contents. Used by bdrv_swap() to put
 * these fields back after a wholesale struct swap.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;

    bs_dest->open_flags = bs_src->open_flags;

    /* attached device and its callbacks */
    bs_dest->dev = bs_src->dev;
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* I/O throttling state and the queues of throttled requests */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap = bs_src->dirty_bitmap;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* block job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;
}
1622

    
1623
/*
1624
 * Swap bs contents for two image chains while they are live,
1625
 * while keeping required fields on the BlockDriverState that is
1626
 * actually attached to a device.
1627
 *
1628
 * This will modify the BlockDriverState fields, and swap contents
1629
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1630
 *
1631
 * bs_new is required to be anonymous.
1632
 *
1633
 * This function does not create any image files.
1634
 */
1635
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1636
{
1637
    BlockDriverState tmp;
1638

    
1639
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1640
    assert(bs_new->device_name[0] == '\0');
1641
    assert(bs_new->dirty_bitmap == NULL);
1642
    assert(bs_new->job == NULL);
1643
    assert(bs_new->dev == NULL);
1644
    assert(bs_new->in_use == 0);
1645
    assert(bs_new->io_limits_enabled == false);
1646
    assert(!throttle_have_timer(&bs_new->throttle_state));
1647

    
1648
    tmp = *bs_new;
1649
    *bs_new = *bs_old;
1650
    *bs_old = tmp;
1651

    
1652
    /* there are some fields that should not be swapped, move them back */
1653
    bdrv_move_feature_fields(&tmp, bs_old);
1654
    bdrv_move_feature_fields(bs_old, bs_new);
1655
    bdrv_move_feature_fields(bs_new, &tmp);
1656

    
1657
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1658
    assert(bs_new->device_name[0] == '\0');
1659

    
1660
    /* Check a few fields that should remain attached to the device */
1661
    assert(bs_new->dev == NULL);
1662
    assert(bs_new->job == NULL);
1663
    assert(bs_new->in_use == 0);
1664
    assert(bs_new->io_limits_enabled == false);
1665
    assert(!throttle_have_timer(&bs_new->throttle_state));
1666

    
1667
    bdrv_rebind(bs_new);
1668
    bdrv_rebind(bs_old);
1669
}
1670

    
1671
/*
1672
 * Add new bs contents at the top of an image chain while the chain is
1673
 * live, while keeping required fields on the top layer.
1674
 *
1675
 * This will modify the BlockDriverState fields, and swap contents
1676
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1677
 *
1678
 * bs_new is required to be anonymous.
1679
 *
1680
 * This function does not create any image files.
1681
 */
1682
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1683
{
1684
    bdrv_swap(bs_new, bs_top);
1685

    
1686
    /* The contents of 'tmp' will become bs_top, as we are
1687
     * swapping bs_new and bs_top contents. */
1688
    bs_top->backing_hd = bs_new;
1689
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1690
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1691
            bs_new->filename);
1692
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1693
            bs_new->drv ? bs_new->drv->format_name : "");
1694
}
1695

    
1696
/*
 * Close bs, take it off the bdrv_states list and free it.
 *
 * bs must have no attached device, no job, no in_use mark and a reference
 * count of zero.
 */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(bs->dev == NULL);
    assert(bs->job == NULL);
    assert(!bs->in_use);
    assert(!bs->refcnt);

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
1710

    
1711
/*
 * Attach dev to bs and reset the I/O status of bs.
 *
 * Returns 0 on success, -EBUSY if bs already has a device attached.
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev != NULL) {
        return -EBUSY;
    }

    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}
1721

    
1722
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1723
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1724
{
1725
    if (bdrv_attach_dev(bs, dev) < 0) {
1726
        abort();
1727
    }
1728
}
1729

    
1730
/*
 * Detach dev from bs. dev must be the device currently attached.
 * Also clears the device callbacks and resets buffer_alignment to 512.
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);

    bs->dev_opaque = NULL;
    bs->dev_ops = NULL;
    bs->dev = NULL;
    bs->buffer_alignment = 512;
}
1739

    
1740
/* TODO change to return DeviceState * when all users are qdevified */
1741
void *bdrv_get_attached_dev(BlockDriverState *bs)
1742
{
1743
    return bs->dev;
1744
}
1745

    
1746
/*
 * Install the device callback table for bs. opaque is stored alongside and
 * handed to the callbacks as their first argument.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_opaque = opaque;
    bs->dev_ops = ops;
}
1752

    
1753
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1754
                               enum MonitorEvent ev,
1755
                               BlockErrorAction action, bool is_read)
1756
{
1757
    QObject *data;
1758
    const char *action_str;
1759

    
1760
    switch (action) {
1761
    case BDRV_ACTION_REPORT:
1762
        action_str = "report";
1763
        break;
1764
    case BDRV_ACTION_IGNORE:
1765
        action_str = "ignore";
1766
        break;
1767
    case BDRV_ACTION_STOP:
1768
        action_str = "stop";
1769
        break;
1770
    default:
1771
        abort();
1772
    }
1773

    
1774
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1775
                              bdrv->device_name,
1776
                              action_str,
1777
                              is_read ? "read" : "write");
1778
    monitor_protocol_event(ev, data);
1779

    
1780
    qobject_decref(data);
1781
}
1782

    
1783
/*
 * Emit a QEVENT_DEVICE_TRAY_MOVED monitor event for bs; 'ejected' is
 * reported as the event's 'tray-open' value.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *event_data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                             bdrv_get_device_name(bs), ejected);

    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event_data);
    qobject_decref(event_data);
}
1793

    
1794
/*
 * Tell the attached device that the medium changed (load is true when a
 * medium was inserted, false when it was removed), and emit the matching
 * tray events. Does nothing if the device has no change_media_cb handler.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool tray_was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* Sample the tray state before the callback gets to change it */
    tray_was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (tray_was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1809

    
1810
/*
 * Returns true if no device is attached to bs, or if the attached device
 * provides a change_media_cb handler.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
1814

    
1815
/*
 * Forward an eject request to the attached device's eject_request_cb
 * handler; a no-op when the device does not provide one.
 */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (!bs->dev_ops || !bs->dev_ops->eject_request_cb) {
        return;
    }
    bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
}
1821

    
1822
/*
 * Ask the attached device whether its tray is open. Returns false when
 * the device has no is_tray_open handler.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_tray_open) {
        return false;
    }
    return bs->dev_ops->is_tray_open(bs->dev_opaque);
}
1829

    
1830
/* Call the attached device's resize_cb handler, if it provides one. */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->resize_cb) {
        return;
    }
    bs->dev_ops->resize_cb(bs->dev_opaque);
}
1836

    
1837
/*
 * Ask the attached device whether its medium is locked. Returns false
 * when the device has no is_medium_locked handler.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_medium_locked) {
        return false;
    }
    return bs->dev_ops->is_medium_locked(bs->dev_opaque);
}
1844

    
1845
/*
1846
 * Run consistency checks on an image
1847
 *
1848
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1849
 * free of errors) or -errno when an internal error occurred. The results of the
1850
 * check are stored in res.
1851
 */
1852
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1853
{
1854
    if (bs->drv->bdrv_check == NULL) {
1855
        return -ENOTSUP;
1856
    }
1857

    
1858
    memset(res, 0, sizeof(*res));
1859
    return bs->drv->bdrv_check(bs, res, fix);
1860
}
1861

    
1862
#define COMMIT_BUF_SECTORS 2048
1863

    
1864
/* commit COW file into the raw image */
1865
int bdrv_commit(BlockDriverState *bs)
1866
{
1867
    BlockDriver *drv = bs->drv;
1868
    int64_t sector, total_sectors;
1869
    int n, ro, open_flags;
1870
    int ret = 0;
1871
    uint8_t *buf;
1872
    char filename[PATH_MAX];
1873

    
1874
    if (!drv)
1875
        return -ENOMEDIUM;
1876
    
1877
    if (!bs->backing_hd) {
1878
        return -ENOTSUP;
1879
    }
1880

    
1881
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1882
        return -EBUSY;
1883
    }
1884

    
1885
    ro = bs->backing_hd->read_only;
1886
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1887
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1888
    open_flags =  bs->backing_hd->open_flags;
1889

    
1890
    if (ro) {
1891
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1892
            return -EACCES;
1893
        }
1894
    }
1895

    
1896
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1897
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1898

    
1899
    for (sector = 0; sector < total_sectors; sector += n) {
1900
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
1901
        if (ret < 0) {
1902
            goto ro_cleanup;
1903
        }
1904
        if (ret) {
1905
            if (bdrv_read(bs, sector, buf, n) != 0) {
1906
                ret = -EIO;
1907
                goto ro_cleanup;
1908
            }
1909

    
1910
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1911
                ret = -EIO;
1912
                goto ro_cleanup;
1913
            }
1914
        }
1915
    }
1916

    
1917
    if (drv->bdrv_make_empty) {
1918
        ret = drv->bdrv_make_empty(bs);
1919
        bdrv_flush(bs);
1920
    }
1921

    
1922
    /*
1923
     * Make sure all data we wrote to the backing device is actually
1924
     * stable on disk.
1925
     */
1926
    if (bs->backing_hd)
1927
        bdrv_flush(bs->backing_hd);
1928

    
1929
ro_cleanup:
1930
    g_free(buf);
1931

    
1932
    if (ro) {
1933
        /* ignoring error return here */
1934
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1935
    }
1936

    
1937
    return ret;
1938
}
1939

    
1940
int bdrv_commit_all(void)
1941
{
1942
    BlockDriverState *bs;
1943

    
1944
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1945
        if (bs->drv && bs->backing_hd) {
1946
            int ret = bdrv_commit(bs);
1947
            if (ret < 0) {
1948
                return ret;
1949
            }
1950
        }
1951
    }
1952
    return 0;
1953
}
1954

    
1955
/**
1956
 * Remove an active request from the tracked requests list
1957
 *
1958
 * This function should be called when a tracked request is completing.
1959
 */
1960
static void tracked_request_end(BdrvTrackedRequest *req)
1961
{
1962
    QLIST_REMOVE(req, list);
1963
    qemu_co_queue_restart_all(&req->wait_queue);
1964
}
1965

    
1966
/**
1967
 * Add an active request to the tracked requests list
1968
 */
1969
static void tracked_request_begin(BdrvTrackedRequest *req,
1970
                                  BlockDriverState *bs,
1971
                                  int64_t sector_num,
1972
                                  int nb_sectors, bool is_write)
1973
{
1974
    *req = (BdrvTrackedRequest){
1975
        .bs = bs,
1976
        .sector_num = sector_num,
1977
        .nb_sectors = nb_sectors,
1978
        .is_write = is_write,
1979
        .co = qemu_coroutine_self(),
1980
    };
1981

    
1982
    qemu_co_queue_init(&req->wait_queue);
1983

    
1984
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1985
}
1986

    
1987
/**
1988
 * Round a region to cluster boundaries
1989
 */
1990
void bdrv_round_to_clusters(BlockDriverState *bs,
1991
                            int64_t sector_num, int nb_sectors,
1992
                            int64_t *cluster_sector_num,
1993
                            int *cluster_nb_sectors)
1994
{
1995
    BlockDriverInfo bdi;
1996

    
1997
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1998
        *cluster_sector_num = sector_num;
1999
        *cluster_nb_sectors = nb_sectors;
2000
    } else {
2001
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2002
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2003
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2004
                                            nb_sectors, c);
2005
    }
2006
}
2007

    
2008
/*
 * Tell whether the sector range [sector_num, sector_num + nb_sectors)
 * intersects the range covered by @req.
 *
 * Two ranges are disjoint exactly when one starts at or after the end of the
 * other; anything else is an overlap.
 */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    return sector_num < req->sector_num + req->nb_sectors &&
           req->sector_num < sector_num + nb_sectors;
}
2020

    
2021
/*
 * Block the calling coroutine until no tracked request on @bs overlaps the
 * cluster-rounded region around [sector_num, sector_num + nb_sectors).
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    for (;;) {
        bool waited = false;

        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (!tracked_request_overlaps(req, cluster_sector_num,
                                          cluster_nb_sectors)) {
                continue;
            }

            /* Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests.  This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            /* After waiting, rescan the list from its head */
            qemu_co_queue_wait(&req->wait_queue);
            waited = true;
            break;
        }

        if (!waited) {
            break;
        }
    }
}
2056

    
2057
/*
2058
 * Return values:
2059
 * 0        - success
2060
 * -EINVAL  - backing format specified, but no file
2061
 * -ENOSPC  - can't update the backing file because no space is left in the
2062
 *            image file header
2063
 * -ENOTSUP - format driver doesn't support changing the backing file
2064
 */
2065
int bdrv_change_backing_file(BlockDriverState *bs,
2066
    const char *backing_file, const char *backing_fmt)
2067
{
2068
    BlockDriver *drv = bs->drv;
2069
    int ret;
2070

    
2071
    /* Backing file format doesn't make sense without a backing file */
2072
    if (backing_fmt && !backing_file) {
2073
        return -EINVAL;
2074
    }
2075

    
2076
    if (drv->bdrv_change_backing_file != NULL) {
2077
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2078
    } else {
2079
        ret = -ENOTSUP;
2080
    }
2081

    
2082
    if (ret == 0) {
2083
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2084
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2085
    }
2086
    return ret;
2087
}
2088

    
2089
/*
2090
 * Finds the image layer in the chain that has 'bs' as its backing file.
2091
 *
2092
 * active is the current topmost image.
2093
 *
2094
 * Returns NULL if bs is not found in active's image chain,
2095
 * or if active == bs.
2096
 */
2097
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2098
                                    BlockDriverState *bs)
2099
{
2100
    BlockDriverState *overlay = NULL;
2101
    BlockDriverState *intermediate;
2102

    
2103
    assert(active != NULL);
2104
    assert(bs != NULL);
2105

    
2106
    /* if bs is the same as active, then by definition it has no overlay
2107
     */
2108
    if (active == bs) {
2109
        return NULL;
2110
    }
2111

    
2112
    intermediate = active;
2113
    while (intermediate->backing_hd) {
2114
        if (intermediate->backing_hd == bs) {
2115
            overlay = intermediate;
2116
            break;
2117
        }
2118
        intermediate = intermediate->backing_hd;
2119
    }
2120

    
2121
    return overlay;
2122
}
2123

    
2124
typedef struct BlkIntermediateStates {
2125
    BlockDriverState *bs;
2126
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2127
} BlkIntermediateStates;
2128

    
2129

    
2130
/*
2131
 * Drops images above 'base' up to and including 'top', and sets the image
2132
 * above 'top' to have base as its backing file.
2133
 *
2134
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2135
 * information in 'bs' can be properly updated.
2136
 *
2137
 * E.g., this will convert the following chain:
2138
 * bottom <- base <- intermediate <- top <- active
2139
 *
2140
 * to
2141
 *
2142
 * bottom <- base <- active
2143
 *
2144
 * It is allowed for bottom==base, in which case it converts:
2145
 *
2146
 * base <- intermediate <- top <- active
2147
 *
2148
 * to
2149
 *
2150
 * base <- active
2151
 *
2152
 * Error conditions:
2153
 *  if active == top, that is considered an error
2154
 *
2155
 */
2156
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2157
                           BlockDriverState *base)
2158
{
2159
    BlockDriverState *intermediate;
2160
    BlockDriverState *base_bs = NULL;
2161
    BlockDriverState *new_top_bs = NULL;
2162
    BlkIntermediateStates *intermediate_state, *next;
2163
    int ret = -EIO;
2164

    
2165
    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2166
    QSIMPLEQ_INIT(&states_to_delete);
2167

    
2168
    if (!top->drv || !base->drv) {
2169
        goto exit;
2170
    }
2171

    
2172
    new_top_bs = bdrv_find_overlay(active, top);
2173

    
2174
    if (new_top_bs == NULL) {
2175
        /* we could not find the image above 'top', this is an error */
2176
        goto exit;
2177
    }
2178

    
2179
    /* special case of new_top_bs->backing_hd already pointing to base - nothing
2180
     * to do, no intermediate images */
2181
    if (new_top_bs->backing_hd == base) {
2182
        ret = 0;
2183
        goto exit;
2184
    }
2185

    
2186
    intermediate = top;
2187

    
2188
    /* now we will go down through the list, and add each BDS we find
2189
     * into our deletion queue, until we hit the 'base'
2190
     */
2191
    while (intermediate) {
2192
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2193
        intermediate_state->bs = intermediate;
2194
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2195

    
2196
        if (intermediate->backing_hd == base) {
2197
            base_bs = intermediate->backing_hd;
2198
            break;
2199
        }
2200
        intermediate = intermediate->backing_hd;
2201
    }
2202
    if (base_bs == NULL) {
2203
        /* something went wrong, we did not end at the base. safely
2204
         * unravel everything, and exit with error */
2205
        goto exit;
2206
    }
2207

    
2208
    /* success - we can delete the intermediate states, and link top->base */
2209
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2210
                                   base_bs->drv ? base_bs->drv->format_name : "");
2211
    if (ret) {
2212
        goto exit;
2213
    }
2214
    new_top_bs->backing_hd = base_bs;
2215

    
2216

    
2217
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2218
        /* so that bdrv_close() does not recursively close the chain */
2219
        intermediate_state->bs->backing_hd = NULL;
2220
        bdrv_unref(intermediate_state->bs);
2221
    }
2222
    ret = 0;
2223

    
2224
exit:
2225
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2226
        g_free(intermediate_state);
2227
    }
2228
    return ret;
2229
}
2230

    
2231

    
2232
/*
 * Validate a byte-granularity request against @bs.
 *
 * Returns -ENOMEDIUM when no medium is inserted, 0 without further checks
 * when @bs is growable, -EIO when @offset is negative or the range
 * [offset, offset + size) does not fit within bdrv_getlength(), else 0.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0 || offset > len || len - offset < size) {
        return -EIO;
    }

    return 0;
}
2253

    
2254
/*
 * Validate a sector-granularity request against @bs.
 *
 * Converts the request to bytes and defers to bdrv_check_byte_request().
 * A negative @nb_sectors is rejected with -EIO up front: multiplied by
 * BDRV_SECTOR_SIZE and passed as a size_t it would turn into a huge byte
 * count, and on growable devices it would pass validation altogether.
 *
 * Returns 0 when the request is acceptable, a negative errno otherwise.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2260

    
2261
typedef struct RwCo {
2262
    BlockDriverState *bs;
2263
    int64_t sector_num;
2264
    int nb_sectors;
2265
    QEMUIOVector *qiov;
2266
    bool is_write;
2267
    int ret;
2268
    BdrvRequestFlags flags;
2269
} RwCo;
2270

    
2271
/*
 * Coroutine entry point: run the request described by the RwCo in @opaque
 * and store its result in the RwCo's ret field.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (rwco->is_write) {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov,
                                     rwco->flags);
    }
}
2285

    
2286
/*
2287
 * Process a vectored synchronous request using coroutines
2288
 */
2289
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
2290
                       QEMUIOVector *qiov, bool is_write,
2291
                       BdrvRequestFlags flags)
2292
{
2293
    Coroutine *co;
2294
    RwCo rwco = {
2295
        .bs = bs,
2296
        .sector_num = sector_num,
2297
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
2298
        .qiov = qiov,
2299
        .is_write = is_write,
2300
        .ret = NOT_DONE,
2301
        .flags = flags,
2302
    };
2303
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
2304

    
2305
    /**
2306
     * In sync call context, when the vcpu is blocked, this throttling timer
2307
     * will not fire; so the I/O throttling function has to be disabled here
2308
     * if it has been enabled.
2309
     */
2310
    if (bs->io_limits_enabled) {
2311
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
2312
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
2313
        bdrv_io_limits_disable(bs);
2314
    }
2315

    
2316
    if (qemu_in_coroutine()) {
2317
        /* Fast-path if already in coroutine context */
2318
        bdrv_rw_co_entry(&rwco);
2319
    } else {
2320
        co = qemu_coroutine_create(bdrv_rw_co_entry);
2321
        qemu_coroutine_enter(co, &rwco);
2322
        while (rwco.ret == NOT_DONE) {
2323
            qemu_aio_wait();
2324
        }
2325
    }
2326
    return rwco.ret;
2327
}
2328

    
2329
/*
2330
 * Process a synchronous request using coroutines
2331
 */
2332
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2333
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
2334
{
2335
    QEMUIOVector qiov;
2336
    struct iovec iov = {
2337
        .iov_base = (void *)buf,
2338
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2339
    };
2340

    
2341
    qemu_iovec_init_external(&qiov, &iov, 1);
2342
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
2343
}
2344

    
2345
/* return < 0 if error. See bdrv_write() for the return codes */
2346
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2347
              uint8_t *buf, int nb_sectors)
2348
{
2349
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2350
}
2351

    
2352
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2353
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2354
                          uint8_t *buf, int nb_sectors)
2355
{
2356
    bool enabled;
2357
    int ret;
2358

    
2359
    enabled = bs->io_limits_enabled;
2360
    bs->io_limits_enabled = false;
2361
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2362
    bs->io_limits_enabled = enabled;
2363
    return ret;
2364
}
2365

    
2366
/* Return < 0 if error. Important errors are:
2367
  -EIO         generic I/O error (may happen for all errors)
2368
  -ENOMEDIUM   No media inserted.
2369
  -EINVAL      Invalid sector number or nb_sectors
2370
  -EACCES      Trying to write a read-only device
2371
*/
2372
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2373
               const uint8_t *buf, int nb_sectors)
2374
{
2375
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2376
}
2377

    
2378
/*
 * Synchronously write the contents of qiov starting at sector_num.
 * The number of sectors written is derived from the size of qiov.
 */
int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    const bool is_write = true;

    return bdrv_rwv_co(bs, sector_num, qiov, is_write, 0);
}
2382

    
2383
/*
 * Synchronously issue a zero-write request (BDRV_REQ_ZERO_WRITE) for
 * nb_sectors sectors starting at sector_num.  No data buffer is supplied.
 */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    const bool is_write = true;

    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, is_write,
                      BDRV_REQ_ZERO_WRITE);
}
2388

    
2389
/*
 * Synchronously read count1 bytes at byte position offset into buf.
 *
 * The request is split into up to three parts: an unaligned head and an
 * unaligned tail, each read through a one-sector temporary buffer, and the
 * whole sectors in between, which are read straight into buf.
 *
 * Returns count1 on success, or the negative result of the failing
 * bdrv_read().
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    uint8_t *dst = buf;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int chunk, nb_sectors;
    int ret;

    /* first read to align to sector start */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), chunk);
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        dst += chunk;
    }

    /* read the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_read(bs, sector_num, dst, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        chunk = nb_sectors << BDRV_SECTOR_BITS;
        sector_num += nb_sectors;
        dst += chunk;
        remaining -= chunk;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, tmp_buf, remaining);
    }

    return count1;
}
2433

    
2434
/*
 * Synchronously write the contents of qiov at byte position offset.
 *
 * An unaligned head and an unaligned tail are handled by reading the
 * affected sector into a temporary buffer, patching it and writing it back;
 * the whole sectors in between are written directly from qiov.
 *
 * Returns qiov->size on success, or the negative result of the failing
 * read or write.
 */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = qiov->size;
    int head_len, nb_sectors;
    int ret;

    /* first write to align to sector start */
    head_len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (head_len > remaining) {
        head_len = remaining;
    }
    if (head_len > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          head_len);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        remaining -= head_len;
        if (remaining == 0) {
            return qiov->size;
        }
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        /* Slice of qiov that starts right after the head bytes */
        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, head_len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        remaining -= nb_sectors << BDRV_SECTOR_BITS;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, qiov->size - remaining, tmp_buf, remaining);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }

    return qiov->size;
}
2490

    
2491
/*
 * Synchronously write count1 bytes from buf at byte position offset.
 *
 * Wraps buf in a single-element I/O vector and defers to bdrv_pwritev().
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    struct iovec iov;
    QEMUIOVector qiov;

    iov.iov_base = (void *) buf;
    iov.iov_len = count1;
    qemu_iovec_init_external(&qiov, &iov, 1);

    return bdrv_pwritev(bs, offset, &qiov);
}
2503

    
2504
/*
2505
 * Writes to the file and ensures that no writes are reordered across this
2506
 * request (acts as a barrier)
2507
 *
2508
 * Returns 0 on success, -errno in error cases.
2509
 */
2510
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2511
    const void *buf, int count)
2512
{
2513
    int ret;
2514

    
2515
    ret = bdrv_pwrite(bs, offset, buf, count);
2516
    if (ret < 0) {
2517
        return ret;
2518
    }
2519

    
2520
    /* No flush needed for cache modes that already do it */
2521
    if (bs->enable_write_cache) {
2522
        bdrv_flush(bs);
2523
    }
2524

    
2525
    return 0;
2526
}
2527

    
2528
/*
 * Serve a read by reading the surrounding whole clusters through the driver,
 * writing them back through the driver, and then copying the requested part
 * into qiov.
 *
 * Returns the negative result of the failing driver call, otherwise the
 * result of the write step.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;
    QEMUIOVector bounce_qiov;
    struct iovec iov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    iov.iov_base = bounce_buffer;
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);

    if (ret >= 0) {
        if (drv->bdrv_co_write_zeroes &&
            buffer_is_zero(bounce_buffer, iov.iov_len)) {
            ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                          cluster_nb_sectors);
        } else {
            /* This does not change the data on the disk, it is not necessary
             * to flush even in cache=writethrough mode.
             */
            ret = drv->bdrv_co_writev(bs, cluster_sector_num,
                                      cluster_nb_sectors, &bounce_qiov);
        }

        /* It might be okay to ignore write errors for guest requests.  If
         * this is a deliberate copy-on-read then we don't want to ignore the
         * error.  Simply report it in all cases.
         */
        if (ret >= 0) {
            size_t skip_bytes =
                (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;

            qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                                nb_sectors * BDRV_SECTOR_SIZE);
        }
    }

    qemu_vfree(bounce_buffer);
    return ret;
}
2593

    
2594
/*
2595
 * Handle a read request in coroutine context
2596
 */
2597
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2598
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2599
    BdrvRequestFlags flags)
2600
{
2601
    BlockDriver *drv = bs->drv;
2602
    BdrvTrackedRequest req;
2603
    int ret;
2604

    
2605
    if (!drv) {
2606
        return -ENOMEDIUM;
2607
    }
2608
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2609
        return -EIO;
2610
    }
2611

    
2612
    if (bs->copy_on_read) {
2613
        flags |= BDRV_REQ_COPY_ON_READ;
2614
    }
2615
    if (flags & BDRV_REQ_COPY_ON_READ) {
2616
        bs->copy_on_read_in_flight++;
2617
    }
2618

    
2619
    if (bs->copy_on_read_in_flight) {
2620
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2621
    }
2622

    
2623
    /* throttling disk I/O */
2624
    if (bs->io_limits_enabled) {
2625
        bdrv_io_limits_intercept(bs, nb_sectors, false);
2626
    }
2627

    
2628
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2629

    
2630
    if (flags & BDRV_REQ_COPY_ON_READ) {
2631
        int pnum;
2632

    
2633
        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2634
        if (ret < 0) {
2635
            goto out;
2636
        }
2637

    
2638
        if (!ret || pnum != nb_sectors) {
2639
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2640
            goto out;
2641
        }
2642
    }
2643

    
2644
    if (!(bs->zero_beyond_eof && bs->growable)) {
2645
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2646
    } else {
2647
        /* Read zeros after EOF of growable BDSes */
2648
        int64_t len, total_sectors, max_nb_sectors;
2649

    
2650
        len = bdrv_getlength(bs);
2651
        if (len < 0) {
2652
            ret = len;
2653
            goto out;
2654
        }
2655

    
2656
        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2657
        max_nb_sectors = MAX(0, total_sectors - sector_num);
2658
        if (max_nb_sectors > 0) {
2659
            ret = drv->bdrv_co_readv(bs, sector_num,
2660
                                     MIN(nb_sectors, max_nb_sectors), qiov);
2661
        } else {
2662
            ret = 0;
2663
        }
2664

    
2665
        /* Reading beyond end of file is supposed to produce zeroes */
2666
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2667
            uint64_t offset = MAX(0, total_sectors - sector_num);
2668
            uint64_t bytes = (sector_num + nb_sectors - offset) *
2669
                              BDRV_SECTOR_SIZE;
2670
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2671
        }
2672
    }
2673

    
2674
out:
2675
    tracked_request_end(&req);
2676

    
2677
    if (flags & BDRV_REQ_COPY_ON_READ) {
2678
        bs->copy_on_read_in_flight--;
2679
    }
2680

    
2681
    return ret;
2682
}
2683

    
2684
/*
 * Coroutine read of nb_sectors sectors starting at sector_num into qiov.
 * Emits a trace event and issues the request with no extra flags.
 */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2691

    
2692
/*
 * Coroutine read that requests copy-on-read handling: same as
 * bdrv_co_readv(), but passes BDRV_REQ_COPY_ON_READ.
 */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
2700

    
2701
/*
 * Write zeroes to nb_sectors sectors starting at sector_num.
 *
 * Uses the driver's bdrv_co_write_zeroes when available; if the driver has
 * none or it answers -ENOTSUP, a zero-filled bounce buffer is written with
 * the driver's regular bdrv_co_writev.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector zero_qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);
    qemu_vfree(iov.iov_base);

    return ret;
}
2731

    
2732
/*
2733
 * Handle a write request in coroutine context
2734
 */
2735
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2736
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2737
    BdrvRequestFlags flags)
2738
{
2739
    BlockDriver *drv = bs->drv;
2740
    BdrvTrackedRequest req;
2741
    int ret;
2742

    
2743
    if (!bs->drv) {
2744
        return -ENOMEDIUM;
2745
    }
2746
    if (bs->read_only) {
2747
        return -EACCES;
2748
    }
2749
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2750
        return -EIO;
2751
    }
2752

    
2753
    if (bs->copy_on_read_in_flight) {
2754
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2755
    }
2756

    
2757
    /* throttling disk I/O */
2758
    if (bs->io_limits_enabled) {
2759
        bdrv_io_limits_intercept(bs, nb_sectors, true);
2760
    }
2761

    
2762
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2763

    
2764
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2765

    
2766
    if (ret < 0) {
2767
        /* Do nothing, write notifier decided to fail this request */
2768
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
2769
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2770
    } else {
2771
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2772
    }
2773

    
2774
    if (ret == 0 && !bs->enable_write_cache) {
2775
        ret = bdrv_co_flush(bs);
2776
    }
2777

    
2778
    if (bs->dirty_bitmap) {
2779
        bdrv_set_dirty(bs, sector_num, nb_sectors);
2780
    }
2781

    
2782
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2783
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
2784
    }
2785
    if (bs->growable && ret >= 0) {
2786
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
2787
    }
2788

    
2789
    tracked_request_end(&req);
2790

    
2791
    return ret;
2792
}
2793

    
2794
/*
 * Write 'nb_sectors' sectors described by 'qiov' starting at 'sector_num'.
 *
 * Emits the bdrv_co_writev trace point and forwards the request to
 * bdrv_co_do_writev() with no request flags set; the return value is the
 * one produced there.
 */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2801

    
2802
/*
 * Zero 'nb_sectors' sectors starting at 'sector_num'.
 *
 * Emits the bdrv_co_write_zeroes trace point and forwards the request to
 * bdrv_co_do_writev() with BDRV_REQ_ZERO_WRITE set and no I/O vector; the
 * return value is the one produced there.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    int ret;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                            BDRV_REQ_ZERO_WRITE);
    return ret;
}
2810

    
2811
/**
2812
 * Truncate file to 'offset' bytes (needed only for file protocols)
2813
 */
2814
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2815
{
2816
    BlockDriver *drv = bs->drv;
2817
    int ret;
2818
    if (!drv)
2819
        return -ENOMEDIUM;
2820
    if (!drv->bdrv_truncate)
2821
        return -ENOTSUP;
2822
    if (bs->read_only)
2823
        return -EACCES;
2824
    if (bdrv_in_use(bs))
2825
        return -EBUSY;
2826
    ret = drv->bdrv_truncate(bs, offset);
2827
    if (ret == 0) {
2828
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2829
        bdrv_dev_resize_cb(bs);
2830
    }
2831
    return ret;
2832
}
2833

    
2834
/**
2835
 * Length of a allocated file in bytes. Sparse files are counted by actual
2836
 * allocated space. Return < 0 if error or unknown.
2837
 */
2838
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2839
{
2840
    BlockDriver *drv = bs->drv;
2841
    if (!drv) {
2842
        return -ENOMEDIUM;
2843
    }
2844
    if (drv->bdrv_get_allocated_file_size) {
2845
        return drv->bdrv_get_allocated_file_size(bs);
2846
    }
2847
    if (bs->file) {
2848
        return bdrv_get_allocated_file_size(bs->file);
2849
    }
2850
    return -ENOTSUP;
2851
}
2852

    
2853
/**
2854
 * Length of a file in bytes. Return < 0 if error or unknown.
2855
 */
2856
int64_t bdrv_getlength(BlockDriverState *bs)
2857
{
2858
    BlockDriver *drv = bs->drv;
2859
    if (!drv)
2860
        return -ENOMEDIUM;
2861

    
2862
    if (bdrv_dev_has_removable_media(bs)) {
2863
        if (drv->bdrv_getlength) {
2864
            return drv->bdrv_getlength(bs);
2865
        }
2866
    }
2867
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2868
}
2869

    
2870
/* return 0 as number of sectors if no device present or error */
2871
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2872
{
2873
    int64_t length;
2874
    length = bdrv_getlength(bs);
2875
    if (length < 0)
2876
        length = 0;
2877
    else
2878
        length = length >> BDRV_SECTOR_BITS;
2879
    *nb_sectors_ptr = length;
2880
}
2881

    
2882
/*
 * Record the error policies of 'bs': 'on_read_error' for failed reads and
 * 'on_write_error' for failed writes.
 */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error  = on_read_error;
}
2888

    
2889
/*
 * Return the error policy of 'bs' for reads ('is_read' true) or for writes
 * ('is_read' false).
 */
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
2893

    
2894
/*
 * Map the configured read or write error policy of 'bs' to the action to
 * take for 'error' (a positive errno value).
 *
 * The "enospc" policy stops only for ENOSPC and reports every other error.
 * An unknown policy value aborts.
 */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError policy;

    if (is_read) {
        policy = bs->on_read_error;
    } else {
        policy = bs->on_write_error;
    }

    switch (policy) {
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_ENOSPC:
        if (error == ENOSPC) {
            return BDRV_ACTION_STOP;
        }
        return BDRV_ACTION_REPORT;
    default:
        abort();
    }
}
2911

    
2912
/* This is done by device models because, while the block layer knows
2913
 * about the error, it does not know whether an operation comes from
2914
 * the device or the block layer (from a job, for example).
2915
 */
2916
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2917
                       bool is_read, int error)
2918
{
2919
    assert(error >= 0);
2920
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
2921
    if (action == BDRV_ACTION_STOP) {
2922
        vm_stop(RUN_STATE_IO_ERROR);
2923
        bdrv_iostatus_set_err(bs, error);
2924
    }
2925
}
2926

    
2927
/* Return the read_only field of 'bs'. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
2931

    
2932
/* Return the sg field of 'bs'. */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
2936

    
2937
/* Return the enable_write_cache field of 'bs'. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
2941

    
2942
/*
 * Switch the write cache of 'bs' on or off and mirror the setting in the
 * BDRV_O_CACHE_WB open flag.
 */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (!wce) {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    } else {
        bs->open_flags |= BDRV_O_CACHE_WB;
    }
}
2953

    
2954
/*
 * Return 1 if the backing image of 'bs' is encrypted; otherwise return the
 * encrypted field of 'bs' itself.
 */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
2960

    
2961
/*
 * Return 1 if 'bs' or its backing image is encrypted and does not have a
 * valid key yet, 0 otherwise.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted && !backing->valid_key) {
        return 1;
    }
    if (!bs->encrypted) {
        return 0;
    }
    return !bs->valid_key;
}
2969

    
2970
/*
 * Set the encryption key of 'bs', and of its backing image if that one is
 * encrypted.
 *
 * The backing image is handled first; a failure there is returned
 * immediately, and if 'bs' itself is not encrypted the call ends with 0.
 * Returns -EINVAL if 'bs' is not encrypted (and has no encrypted backing
 * image) and -ENOMEDIUM if it has no driver or the driver has no
 * bdrv_set_key callback.  Otherwise the driver's result is returned; the
 * valid_key field is cleared on failure and set on success.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    BlockDriverState *backing = bs->backing_hd;
    int ret;

    if (backing && backing->encrypted) {
        ret = bdrv_set_key(backing, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2995

    
2996
/* Return the format name of the driver of 'bs', or NULL without a driver. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }
    return bs->drv->format_name;
}
3000

    
3001
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3002
                         void *opaque)
3003
{
3004
    BlockDriver *drv;
3005

    
3006
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
3007
        it(opaque, drv->format_name);
3008
    }
3009
}
3010

    
3011
BlockDriverState *bdrv_find(const char *name)
3012
{
3013
    BlockDriverState *bs;
3014

    
3015
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3016
        if (!strcmp(name, bs->device_name)) {
3017
            return bs;
3018
        }
3019
    }
3020
    return NULL;
3021
}
3022

    
3023
/*
 * Step through the bdrv_states list: a NULL 'bs' yields the first entry,
 * any other value yields the entry that follows it.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
3030

    
3031
/*
 * Call 'it' once for every block device on the bdrv_states list, passing
 * 'opaque' and the device.
 */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
3039

    
3040
/* Return the device_name field of 'bs'. */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
3044

    
3045
/* Return the open_flags field of 'bs'. */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
3049

    
3050
int bdrv_flush_all(void)
3051
{
3052
    BlockDriverState *bs;
3053
    int result = 0;
3054

    
3055
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3056
        int ret = bdrv_flush(bs);
3057
        if (ret < 0 && !result) {
3058
            result = ret;
3059
        }
3060
    }
3061

    
3062
    return result;
3063
}
3064

    
3065
/*
 * Unconditionally returns 1; 'bs' is not examined.  The signature matches
 * the one bdrv_has_zero_init() uses for the driver callback of the same
 * name.
 */
int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}
3069

    
3070
/*
 * Report whether a newly created image reads back as zeroes.
 *
 * 'bs' must have a driver.  Returns 0 for images with a backing file and
 * for drivers that do not implement bdrv_has_zero_init; otherwise returns
 * what the driver callback reports.
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }

    /* safe default when the driver cannot tell */
    if (!bs->drv->bdrv_has_zero_init) {
        return 0;
    }

    return bs->drv->bdrv_has_zero_init(bs);
}
3086

    
3087
/*
 * Argument and result bundle that bdrv_get_block_status() passes as the
 * opaque pointer to bdrv_get_block_status_co_entry().
 */
typedef struct BdrvCoGetBlockStatusData {
3088
    BlockDriverState *bs;       /* device to query */
3089
    BlockDriverState *base;     /* not set or read by bdrv_get_block_status() or its coroutine entry */
3090
    int64_t sector_num;         /* first sector of the queried range */
3091
    int nb_sectors;             /* maximum number of sectors to report on */
3092
    int *pnum;                  /* caller's output location for the sector count */
3093
    int64_t ret;                /* result of bdrv_co_get_block_status() */
3094
    bool done;                  /* set to true by the coroutine entry once ret is valid */
3095
} BdrvCoGetBlockStatusData;
3096

    
3097
/*
3098
 * Returns true iff the specified sector is present in the disk image. Drivers
3099
 * not implementing the functionality are assumed to not support backing files,
3100
 * hence all their sectors are reported as allocated.
3101
 *
3102
 * If 'sector_num' is beyond the end of the disk image the return value is 0
3103
 * and 'pnum' is set to 0.
3104
 *
3105
 * 'pnum' is set to the number of sectors (including and immediately following
3106
 * the specified sector) that are known to be in the same
3107
 * allocated/unallocated state.
3108
 *
3109
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3110
 * beyond the end of the disk image it will be clamped.
3111
 */
3112
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3113
                                                     int64_t sector_num,
3114
                                                     int nb_sectors, int *pnum)
3115
{
3116
    int64_t length;
3117
    int64_t n;
3118
    int64_t ret, ret2;
3119

    
3120
    length = bdrv_getlength(bs);
3121
    if (length < 0) {
3122
        return length;
3123
    }
3124

    
3125
    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3126
        *pnum = 0;
3127
        return 0;
3128
    }
3129

    
3130
    n = bs->total_sectors - sector_num;
3131
    if (n < nb_sectors) {
3132
        nb_sectors = n;
3133
    }
3134

    
3135
    if (!bs->drv->bdrv_co_get_block_status) {
3136
        *pnum = nb_sectors;
3137
        ret = BDRV_BLOCK_DATA;
3138
        if (bs->drv->protocol_name) {
3139
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3140
        }
3141
        return ret;
3142
    }
3143

    
3144
    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3145
    if (ret < 0) {
3146
        *pnum = 0;
3147
        return ret;
3148
    }
3149

    
3150
    if (ret & BDRV_BLOCK_RAW) {
3151
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
3152
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3153
                                     *pnum, pnum);
3154
    }
3155

    
3156
    if (!(ret & BDRV_BLOCK_DATA)) {
3157
        if (bdrv_has_zero_init(bs)) {
3158
            ret |= BDRV_BLOCK_ZERO;
3159
        } else if (bs->backing_hd) {
3160
            BlockDriverState *bs2 = bs->backing_hd;
3161
            int64_t length2 = bdrv_getlength(bs2);
3162
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3163
                ret |= BDRV_BLOCK_ZERO;
3164
            }
3165
        }
3166
    }
3167

    
3168
    if (bs->file &&
3169
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3170
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
3171
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3172
                                        *pnum, pnum);
3173
        if (ret2 >= 0) {
3174
            /* Ignore errors.  This is just providing extra information, it
3175
             * is useful but not necessary.
3176
             */
3177
            ret |= (ret2 & BDRV_BLOCK_ZERO);
3178
        }
3179
    }
3180

    
3181
    return ret;
3182
}
3183

    
3184
/* Coroutine wrapper for bdrv_get_block_status() */
3185
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3186
{
3187
    BdrvCoGetBlockStatusData *data = opaque;
3188
    BlockDriverState *bs = data->bs;
3189

    
3190
    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3191
                                         data->pnum);
3192
    data->done = true;
3193
}
3194

    
3195
/*
3196
 * Synchronous wrapper around bdrv_co_get_block_status().
3197
 *
3198
 * See bdrv_co_get_block_status() for details.
3199
 */
3200
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3201
                              int nb_sectors, int *pnum)
3202
{
3203
    Coroutine *co;
3204
    BdrvCoGetBlockStatusData data = {
3205
        .bs = bs,
3206
        .sector_num = sector_num,
3207
        .nb_sectors = nb_sectors,
3208
        .pnum = pnum,
3209
        .done = false,
3210
    };
3211

    
3212
    if (qemu_in_coroutine()) {
3213
        /* Fast-path if already in coroutine context */
3214
        bdrv_get_block_status_co_entry(&data);
3215
    } else {
3216
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3217
        qemu_coroutine_enter(co, &data);
3218
        while (!data.done) {
3219
            qemu_aio_wait();
3220
        }
3221
    }
3222
    return data.ret;
3223
}
3224

    
3225
/*
 * Report whether the sectors starting at 'sector_num' are allocated in 'bs'.
 *
 * Returns a negative value if bdrv_get_block_status() fails.  Otherwise
 * returns 1 if the status has BDRV_BLOCK_DATA set, or has BDRV_BLOCK_ZERO
 * set while bdrv_has_zero_init() is false for 'bs'; returns 0 in all other
 * cases.  'pnum' is filled in by bdrv_get_block_status().
 */
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t status = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);

    if (status < 0) {
        return status;
    }
    if (status & BDRV_BLOCK_DATA) {
        return 1;
    }
    if (!(status & BDRV_BLOCK_ZERO)) {
        return 0;
    }
    return !bdrv_has_zero_init(bs);
}
3236

    
3237
/*
3238
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3239
 *
3240
 * Return true if the given sector is allocated in any image between
3241
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3242
 * sector is allocated in any image of the chain.  Return false otherwise.
3243
 *
3244
 * 'pnum' is set to the number of sectors (including and immediately following
3245
 *  the specified sector) that are known to be in the same
3246
 *  allocated/unallocated state.
3247
 *
3248
 */
3249
int bdrv_is_allocated_above(BlockDriverState *top,
3250
                            BlockDriverState *base,
3251
                            int64_t sector_num,
3252
                            int nb_sectors, int *pnum)
3253
{
3254
    BlockDriverState *intermediate;
3255
    int ret, n = nb_sectors;
3256

    
3257
    intermediate = top;
3258
    while (intermediate && intermediate != base) {
3259
        int pnum_inter;
3260
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3261
                                &pnum_inter);
3262
        if (ret < 0) {
3263
            return ret;
3264
        } else if (ret) {
3265
            *pnum = pnum_inter;
3266
            return 1;
3267
        }
3268

    
3269
        /*
3270
         * [sector_num, nb_sectors] is unallocated on top but intermediate
3271
         * might have
3272
         *
3273
         * [sector_num+x, nr_sectors] allocated.
3274
         */
3275
        if (n > pnum_inter &&
3276
            (intermediate == top ||
3277
             sector_num + pnum_inter < intermediate->total_sectors)) {
3278
            n = pnum_inter;
3279
        }
3280

    
3281
        intermediate = intermediate->backing_hd;
3282
    }
3283

    
3284
    *pnum = n;
3285
    return 0;
3286
}
3287

    
3288
/*
 * Return the name of the encrypted image behind 'bs': the backing file name
 * if the backing image is encrypted, else the file name of 'bs' if 'bs' is
 * encrypted, else NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }
    if (bs->encrypted) {
        return bs->filename;
    }
    return NULL;
}
3297

    
3298
/*
 * Copy the backing file name of 'bs' into 'filename' using pstrcpy() with
 * 'filename_size' as the size of the destination buffer.
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing_name = bs->backing_file;

    pstrcpy(filename, filename_size, backing_name);
}
3303

    
3304
/*
 * Write 'nb_sectors' sectors from 'buf' at 'sector_num' through the
 * driver's bdrv_write_compressed callback.
 *
 * Returns -ENOMEDIUM if no driver is attached, -ENOTSUP if the driver has no
 * such callback and -EIO if bdrv_check_request() rejects the range;
 * otherwise the callback's result.  Must not be used while a dirty bitmap
 * is active on 'bs'.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
3319

    
3320
/*
 * Fill '*bdi' with driver information about 'bs'.
 *
 * Returns -ENOMEDIUM if no driver is attached and -ENOTSUP if the driver
 * has no bdrv_get_info callback; '*bdi' is left untouched in both cases.
 * Otherwise '*bdi' is zeroed and the callback's result is returned.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
3330

    
3331
/*
 * Return the result of the driver's bdrv_get_specific_info callback, or
 * NULL if 'bs' has no driver or the driver lacks that callback.
 */
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_get_specific_info) {
        return NULL;
    }
    return drv->bdrv_get_specific_info(bs);
}
3339

    
3340
/*
 * Save 'size' bytes of VM state from 'buf' at position 'pos'.
 *
 * The buffer is wrapped in a single-element I/O vector and passed to
 * bdrv_writev_vmstate(), whose result is returned.
 */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    struct iovec vec = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };
    QEMUIOVector wrapper;

    qemu_iovec_init_external(&wrapper, &vec, 1);

    return bdrv_writev_vmstate(bs, &wrapper, pos);
}
3352

    
3353
/*
 * Save the VM state described by 'qiov' at position 'pos'.
 *
 * Walks down the bs->file chain starting at 'bs' and uses the first driver
 * that implements bdrv_save_vmstate.  Returns -ENOMEDIUM when a node without
 * a driver is reached and -ENOTSUP when the chain ends without any driver
 * implementing the callback.
 */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    do {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_save_vmstate) {
            return drv->bdrv_save_vmstate(bs, qiov, pos);
        }
        bs = bs->file;
    } while (bs);

    return -ENOTSUP;
}
3367

    
3368
/*
 * Load 'size' bytes of VM state from position 'pos' into 'buf'.
 *
 * Walks down the bs->file chain starting at 'bs' and uses the first driver
 * that implements bdrv_load_vmstate.  Returns -ENOMEDIUM when a node without
 * a driver is reached and -ENOTSUP when the chain ends without any driver
 * implementing the callback.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    do {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_load_vmstate) {
            return drv->bdrv_load_vmstate(bs, buf, pos, size);
        }
        bs = bs->file;
    } while (bs);

    return -ENOTSUP;
}
3380

    
3381
/*
 * Pass 'event' to the driver's bdrv_debug_event callback.  Does nothing if
 * 'bs' is NULL, has no driver, or the driver lacks the callback.
 */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (bs && bs->drv && bs->drv->bdrv_debug_event) {
        bs->drv->bdrv_debug_event(bs, event);
    }
}
3389

    
3390
/*
 * Forward a debug breakpoint request ('event', 'tag') to the first node on
 * the bs->file chain whose driver implements bdrv_debug_breakpoint.
 *
 * Returns the callback's result, or -ENOTSUP if the chain ends or reaches a
 * node without a driver before such a driver is found.
 */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_breakpoint) {
            return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
        }
    }

    return -ENOTSUP;
}
3403

    
3404
/*
 * Forward a debug resume request for 'tag' to the first node on the
 * bs->file chain whose driver implements bdrv_debug_resume.
 *
 * Returns the callback's result, or -ENOTSUP if the chain ends or reaches a
 * node without a driver before such a driver is found.
 */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_resume) {
            return bs->drv->bdrv_debug_resume(bs, tag);
        }
    }

    return -ENOTSUP;
}
3416

    
3417
/*
 * Ask the first node on the bs->file chain whose driver implements
 * bdrv_debug_is_suspended whether 'tag' is suspended.
 *
 * Returns the callback's result, or false if the chain ends or reaches a
 * node without a driver before such a driver is found.
 */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_is_suspended) {
            return bs->drv->bdrv_debug_is_suspended(bs, tag);
        }
    }

    return false;
}
3429

    
3430
/* Return 1 if BDRV_O_SNAPSHOT is set in the open flags of 'bs', else 0. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
3434

    
3435
/* backing_file can either be relative, or absolute, or a protocol.  If it is
3436
 * relative, it must be relative to the chain.  So, passing in bs->filename
3437
 * from a BDS as backing_file should not be done, as that may be relative to
3438
 * the CWD rather than the chain. */
3439
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3440
        const char *backing_file)
3441
{
3442
    char *filename_full = NULL;
3443
    char *backing_file_full = NULL;
3444
    char *filename_tmp = NULL;
3445
    int is_protocol = 0;
3446
    BlockDriverState *curr_bs = NULL;
3447
    BlockDriverState *retval = NULL;
3448

    
3449
    if (!bs || !bs->drv || !backing_file) {
3450
        return NULL;
3451
    }
3452

    
3453
    filename_full     = g_malloc(PATH_MAX);
3454
    backing_file_full = g_malloc(PATH_MAX);
3455
    filename_tmp      = g_malloc(PATH_MAX);
3456

    
3457
    is_protocol = path_has_protocol(backing_file);
3458

    
3459
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3460

    
3461
        /* If either of the filename paths is actually a protocol, then
3462
         * compare unmodified paths; otherwise make paths relative */
3463
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3464
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3465
                retval = curr_bs->backing_hd;
3466
                break;
3467
            }
3468
        } else {
3469
            /* If not an absolute filename path, make it relative to the current
3470
             * image's filename path */
3471
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3472
                         backing_file);
3473

    
3474
            /* We are going to compare absolute pathnames */
3475
            if (!realpath(filename_tmp, filename_full)) {
3476
                continue;
3477
            }
3478

    
3479
            /* We need to make sure the backing filename we are comparing against
3480
             * is relative to the current image filename (or absolute) */
3481
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3482
                         curr_bs->backing_file);
3483

    
3484
            if (!realpath(filename_tmp, backing_file_full)) {
3485
                continue;
3486
            }
3487

    
3488
            if (strcmp(backing_file_full, filename_full) == 0) {
3489
                retval = curr_bs->backing_hd;
3490
                break;
3491
            }
3492
        }
3493
    }
3494

    
3495
    g_free(filename_full);
3496
    g_free(backing_file_full);
3497
    g_free(filename_tmp);
3498
    return retval;
3499
}
3500

    
3501
/*
 * Count the backing images below 'bs'.
 *
 * Follows the backing_hd links and counts one per link; counting stops at
 * the first image that has no driver or no backing image.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
3513

    
3514
/*
 * Return the last image reached by following the backing_hd links from
 * 'bs' (which is 'bs' itself if it has no backing image), or NULL if 'bs'
 * is NULL.
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    if (!bs) {
        return NULL;
    }

    while (bs->backing_hd) {
        bs = bs->backing_hd;
    }

    return bs;
}
3529

    
3530
/**************************************************************/
3531
/* async I/Os */
3532

    
3533
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3534
                                 QEMUIOVector *qiov, int nb_sectors,
3535
                                 BlockDriverCompletionFunc *cb, void *opaque)
3536
{
3537
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3538

    
3539
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3540
                                 cb, opaque, false);
3541
}
3542

    
3543
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3544
                                  QEMUIOVector *qiov, int nb_sectors,
3545
                                  BlockDriverCompletionFunc *cb, void *opaque)
3546
{
3547
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3548

    
3549
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3550
                                 cb, opaque, true);
3551
}
3552

    
3553

    
3554
/*
 * Shared completion state of one bdrv_aio_multiwrite() call.  It is
 * allocated there with one callbacks[] entry per submitted request and
 * freed by multiwrite_cb() when the last write has completed.
 */
typedef struct MultiwriteCB {
3555
    int error;          /* first negative result seen by multiwrite_cb(), else 0 */
3556
    int num_requests;   /* writes still outstanding (counted after merging) */
3557
    int num_callbacks;  /* number of entries in callbacks[] */
3558
    struct {
3559
        BlockDriverCompletionFunc *cb;  /* caller's completion function */
3560
        void *opaque;                   /* argument passed to cb */
3561
        QEMUIOVector *free_qiov;        /* merged vector to destroy and free after cb, or NULL */
3562
    } callbacks[];
3563
} MultiwriteCB;
3564

    
3565
/*
 * Run every caller callback recorded in 'mcb' with the overall error code,
 * then destroy and free the merged I/O vector attached to that entry, if
 * there is one.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx = 0;

    while (idx < mcb->num_callbacks) {
        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        if (mcb->callbacks[idx].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[idx].free_qiov);
        }
        g_free(mcb->callbacks[idx].free_qiov);

        idx++;
    }
}
3577

    
3578
static void multiwrite_cb(void *opaque, int ret)
3579
{
3580
    MultiwriteCB *mcb = opaque;
3581

    
3582
    trace_multiwrite_cb(mcb, ret);
3583

    
3584
    if (ret < 0 && !mcb->error) {
3585
        mcb->error = ret;
3586
    }
3587

    
3588
    mcb->num_requests--;
3589
    if (mcb->num_requests == 0) {
3590
        multiwrite_user_cb(mcb);
3591
        g_free(mcb);
3592
    }
3593
}
3594

    
3595
static int multiwrite_req_compare(const void *a, const void *b)
3596
{
3597
    const BlockRequest *req1 = a, *req2 = b;
3598

    
3599
    /*
3600
     * Note that we can't simply subtract req2->sector from req1->sector
3601
     * here as that could overflow the return value.
3602
     */
3603
    if (req1->sector > req2->sector) {
3604
        return 1;
3605
    } else if (req1->sector < req2->sector) {
3606
        return -1;
3607
    } else {
3608
        return 0;
3609
    }
3610
}
3611

    
3612
/*
3613
 * Takes a bunch of requests and tries to merge them. Returns the number of
3614
 * requests that remain after merging.
3615
 */
3616
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3617
    int num_reqs, MultiwriteCB *mcb)
3618
{
3619
    int i, outidx;
3620

    
3621
    // Sort requests by start sector
3622
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3623

    
3624
    // Check if adjacent requests touch the same clusters. If so, combine them,
3625
    // filling up gaps with zero sectors.
3626
    outidx = 0;
3627
    for (i = 1; i < num_reqs; i++) {
3628
        int merge = 0;
3629
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3630

    
3631
        // Handle exactly sequential writes and overlapping writes.
3632
        if (reqs[i].sector <= oldreq_last) {
3633
            merge = 1;
3634
        }
3635

    
3636
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3637
            merge = 0;
3638
        }
3639

    
3640
        if (merge) {
3641
            size_t size;
3642
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3643
            qemu_iovec_init(qiov,
3644
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3645

    
3646
            // Add the first request to the merged one. If the requests are
3647
            // overlapping, drop the last sectors of the first request.
3648
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
3649
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3650

    
3651
            // We should need to add any zeros between the two requests
3652
            assert (reqs[i].sector <= oldreq_last);
3653

    
3654
            // Add the second request
3655
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3656

    
3657
            reqs[outidx].nb_sectors = qiov->size >> 9;
3658
            reqs[outidx].qiov = qiov;
3659

    
3660
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3661
        } else {
3662
            outidx++;
3663
            reqs[outidx].sector     = reqs[i].sector;
3664
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3665
            reqs[outidx].qiov       = reqs[i].qiov;
3666
        }
3667
    }
3668

    
3669
    return outidx + 1;
3670
}
3671

    
3672
/*
3673
 * Submit multiple AIO write requests at once.
3674
 *
3675
 * On success, the function returns 0 and all requests in the reqs array have
3676
 * been submitted. In error case this function returns -1, and any of the
3677
 * requests may or may not be submitted yet. In particular, this means that the
3678
 * callback will be called for some of the requests, for others it won't. The
3679
 * caller must check the error field of the BlockRequest to wait for the right
3680
 * callbacks (if error != 0, no callback will be called).
3681
 *
3682
 * The implementation may modify the contents of the reqs array, e.g. to merge
3683
 * requests. However, the fields opaque and error are left unmodified as they
3684
 * are used to signal failure for a single request to the caller.
3685
 */
3686
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3687
{
3688
    MultiwriteCB *mcb;
3689
    int i;
3690

    
3691
    /* don't submit writes if we don't have a medium */
3692
    if (bs->drv == NULL) {
3693
        for (i = 0; i < num_reqs; i++) {
3694
            reqs[i].error = -ENOMEDIUM;
3695
        }
3696
        return -1;
3697
    }
3698

    
3699
    if (num_reqs == 0) {
3700
        return 0;
3701
    }
3702

    
3703
    // Create MultiwriteCB structure
3704
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3705
    mcb->num_requests = 0;
3706
    mcb->num_callbacks = num_reqs;
3707

    
3708
    for (i = 0; i < num_reqs; i++) {
3709
        mcb->callbacks[i].cb = reqs[i].cb;
3710
        mcb->callbacks[i].opaque = reqs[i].opaque;
3711
    }
3712

    
3713
    // Check for mergable requests
3714
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3715

    
3716
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3717

    
3718
    /* Run the aio requests. */
3719
    mcb->num_requests = num_reqs;
3720
    for (i = 0; i < num_reqs; i++) {
3721
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3722
            reqs[i].nb_sectors, multiwrite_cb, mcb);
3723
    }
3724

    
3725
    return 0;
3726
}
3727

    
3728
/*
 * Cancel the asynchronous request 'acb' by calling the cancel function
 * recorded in its AIOCB info.
 */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    (*acb->aiocb_info->cancel)(acb);
}
3732

    
3733
/**************************************************************/
3734
/* async block device emulation */
3735

    
3736
/*
 * AIOCB of a request that bdrv_aio_rw_vector() carries out synchronously
 * and completes later from the bottom half bdrv_aio_bh_cb().
 */
typedef struct BlockDriverAIOCBSync {
3737
    BlockDriverAIOCB common;    /* generic AIOCB part; holds the completion callback and its opaque */
3738
    QEMUBH *bh;                 /* bottom half running bdrv_aio_bh_cb() */
3739
    int ret;                    /* result of the driver's bdrv_read/bdrv_write call */
3740
    /* vector translation state */
3741
    QEMUIOVector *qiov;         /* caller's I/O vector */
3742
    uint8_t *bounce;            /* linear buffer from qemu_blockalign() handed to the driver */
3743
    int is_write;               /* non-zero for a write, zero for a read */
3744
} BlockDriverAIOCBSync;
3745

    
3746
/*
 * Cancel function for requests created by bdrv_aio_rw_vector().
 *
 * The pending bottom half is deleted, so bdrv_aio_bh_cb() never runs for
 * this request and the completion callback is not invoked.  The bounce
 * buffer, which bdrv_aio_bh_cb() would otherwise have released, is freed
 * here before the AIOCB itself is released.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;

    /* Nobody else frees the bounce buffer once the bottom half is gone */
    qemu_vfree(acb->bounce);
    acb->bounce = NULL;

    qemu_aio_release(acb);
}
3754

    
3755
/*
 * AIOCB description passed to qemu_aio_get() by bdrv_aio_rw_vector(): gives
 * the size of BlockDriverAIOCBSync and the function that cancels such a
 * request.
 */
static const AIOCBInfo bdrv_em_aiocb_info = {
3756
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3757
    .cancel             = bdrv_aio_cancel_em,
3758
};
3759

    
3760
static void bdrv_aio_bh_cb(void *opaque)
3761
{
3762
    BlockDriverAIOCBSync *acb = opaque;
3763

    
3764
    if (!acb->is_write)
3765
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3766
    qemu_vfree(acb->bounce);
3767
    acb->common.cb(acb->common.opaque, acb->ret);
3768
    qemu_bh_delete(acb->bh);
3769
    acb->bh = NULL;
3770
    qemu_aio_release(acb);
3771
}
3772

    
3773
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3774
                                            int64_t sector_num,
3775
                                            QEMUIOVector *qiov,
3776
                                            int nb_sectors,
3777
                                            BlockDriverCompletionFunc *cb,
3778
                                            void *opaque,
3779
                                            int is_write)
3780

    
3781
{
3782
    BlockDriverAIOCBSync *acb;
3783

    
3784
    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
3785
    acb->is_write = is_write;
3786
    acb->qiov = qiov;
3787
    acb->bounce = qemu_blockalign(bs, qiov->size);
3788
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3789

    
3790
    if (is_write) {
3791
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3792
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3793
    } else {
3794
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3795
    }
3796

    
3797
    qemu_bh_schedule(acb->bh);
3798

    
3799
    return &acb->common;
3800
}
3801

    
3802
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3803
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3804
        BlockDriverCompletionFunc *cb, void *opaque)
3805
{
3806
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3807
}
3808

    
3809
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3810
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3811
        BlockDriverCompletionFunc *cb, void *opaque)
3812
{
3813
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3814
}
3815

    
3816

    
3817
typedef struct BlockDriverAIOCBCoroutine {
3818
    BlockDriverAIOCB common;
3819
    BlockRequest req;
3820
    bool is_write;
3821
    bool *done;
3822
    QEMUBH* bh;
3823
} BlockDriverAIOCBCoroutine;
3824

    
3825
/*
 * Cancel hook for coroutine-based requests: rather than aborting, wait
 * until bdrv_co_em_bh() reports completion through the 'done' flag.
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    bool finished = false;
    BlockDriverAIOCBCoroutine *co_acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);

    co_acb->done = &finished;

    /* 'finished' is false on entry, so at least one wait is always needed. */
    do {
        qemu_aio_wait();
    } while (!finished);
}
3836

    
3837
static const AIOCBInfo bdrv_em_co_aiocb_info = {
3838
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3839
    .cancel             = bdrv_aio_co_cancel_em,
3840
};
3841

    
3842
static void bdrv_co_em_bh(void *opaque)
3843
{
3844
    BlockDriverAIOCBCoroutine *acb = opaque;
3845

    
3846
    acb->common.cb(acb->common.opaque, acb->req.error);
3847

    
3848
    if (acb->done) {
3849
        *acb->done = true;
3850
    }
3851

    
3852
    qemu_bh_delete(acb->bh);
3853
    qemu_aio_release(acb);
3854
}
3855

    
3856
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3857
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3858
{
3859
    BlockDriverAIOCBCoroutine *acb = opaque;
3860
    BlockDriverState *bs = acb->common.bs;
3861

    
3862
    if (!acb->is_write) {
3863
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3864
            acb->req.nb_sectors, acb->req.qiov, 0);
3865
    } else {
3866
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3867
            acb->req.nb_sectors, acb->req.qiov, 0);
3868
    }
3869

    
3870
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3871
    qemu_bh_schedule(acb->bh);
3872
}
3873

    
3874
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3875
                                               int64_t sector_num,
3876
                                               QEMUIOVector *qiov,
3877
                                               int nb_sectors,
3878
                                               BlockDriverCompletionFunc *cb,
3879
                                               void *opaque,
3880
                                               bool is_write)
3881
{
3882
    Coroutine *co;
3883
    BlockDriverAIOCBCoroutine *acb;
3884

    
3885
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3886
    acb->req.sector = sector_num;
3887
    acb->req.nb_sectors = nb_sectors;
3888
    acb->req.qiov = qiov;
3889
    acb->is_write = is_write;
3890
    acb->done = NULL;
3891

    
3892
    co = qemu_coroutine_create(bdrv_co_do_rw);
3893
    qemu_coroutine_enter(co, acb);
3894

    
3895
    return &acb->common;
3896
}
3897

    
3898
/*
 * Coroutine entry point for bdrv_aio_flush(): flush, record the result and
 * schedule bdrv_co_em_bh() to deliver it.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *req = opaque;

    req->req.error = bdrv_co_flush(req->common.bs);

    req->bh = qemu_bh_new(bdrv_co_em_bh, req);
    qemu_bh_schedule(req->bh);
}
3907

    
3908
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3909
        BlockDriverCompletionFunc *cb, void *opaque)
3910
{
3911
    trace_bdrv_aio_flush(bs, opaque);
3912

    
3913
    Coroutine *co;
3914
    BlockDriverAIOCBCoroutine *acb;
3915

    
3916
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3917
    acb->done = NULL;
3918

    
3919
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3920
    qemu_coroutine_enter(co, acb);
3921

    
3922
    return &acb->common;
3923
}
3924

    
3925
/*
 * Coroutine entry point for bdrv_aio_discard(): discard the requested
 * range, record the result and schedule bdrv_co_em_bh() to deliver it.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *req = opaque;
    BlockRequest *rq = &req->req;

    rq->error = bdrv_co_discard(req->common.bs, rq->sector, rq->nb_sectors);

    req->bh = qemu_bh_new(bdrv_co_em_bh, req);
    qemu_bh_schedule(req->bh);
}
3934

    
3935
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3936
        int64_t sector_num, int nb_sectors,
3937
        BlockDriverCompletionFunc *cb, void *opaque)
3938
{
3939
    Coroutine *co;
3940
    BlockDriverAIOCBCoroutine *acb;
3941

    
3942
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3943

    
3944
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3945
    acb->req.sector = sector_num;
3946
    acb->req.nb_sectors = nb_sectors;
3947
    acb->done = NULL;
3948
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3949
    qemu_coroutine_enter(co, acb);
3950

    
3951
    return &acb->common;
3952
}
3953

    
3954
void bdrv_init(void)
3955
{
3956
    module_call_init(MODULE_INIT_BLOCK);
3957
}
3958

    
3959
void bdrv_init_with_whitelist(void)
3960
{
3961
    use_bdrv_whitelist = 1;
3962
    bdrv_init();
3963
}
3964

    
3965
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
3966
                   BlockDriverCompletionFunc *cb, void *opaque)
3967
{
3968
    BlockDriverAIOCB *acb;
3969

    
3970
    acb = g_slice_alloc(aiocb_info->aiocb_size);
3971
    acb->aiocb_info = aiocb_info;
3972
    acb->bs = bs;
3973
    acb->cb = cb;
3974
    acb->opaque = opaque;
3975
    return acb;
3976
}
3977

    
3978
void qemu_aio_release(void *p)
3979
{
3980
    BlockDriverAIOCB *acb = p;
3981
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
3982
}
3983

    
3984
/**************************************************************/
3985
/* Coroutine block device emulation */
3986

    
3987
typedef struct CoroutineIOCompletion {
3988
    Coroutine *coroutine;
3989
    int ret;
3990
} CoroutineIOCompletion;
3991

    
3992
static void bdrv_co_io_em_complete(void *opaque, int ret)
3993
{
3994
    CoroutineIOCompletion *co = opaque;
3995

    
3996
    co->ret = ret;
3997
    qemu_coroutine_enter(co->coroutine, NULL);
3998
}
3999

    
4000
/*
 * Emulate a coroutine read/write with the driver's bdrv_aio_readv or
 * bdrv_aio_writev: submit the request, yield, and return the result stored
 * by bdrv_co_io_em_complete().  Returns -EIO if the driver returns no AIOCB.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &completion);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &completion);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    /* Resumed by bdrv_co_io_em_complete() once the request has finished. */
    qemu_coroutine_yield();

    return completion.ret;
}
4025

    
4026
/* Read flavour of bdrv_co_io_em(). */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4032

    
4033
/* Write flavour of bdrv_co_io_em(). */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4039

    
4040
/* Coroutine entry point for bdrv_flush(): store the flush result in RwCo. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *state = opaque;

    state->ret = bdrv_co_flush(state->bs);
}
4046

    
4047
/*
 * Flush bs and then, recursively, bs->file.
 *
 * Returns 0 without doing anything if bs is NULL, has no medium inserted or
 * is read-only.  Otherwise returns the first negative value reported by a
 * driver hook, or the result of flushing bs->file.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (bs == NULL) {
        return 0;
    }
    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            CoroutineIOCompletion completion = {
                .coroutine = qemu_coroutine_self(),
            };
            BlockDriverAIOCB *acb;

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
                                          &completion);
            if (acb != NULL) {
                /* Resumed by bdrv_co_io_em_complete() */
                qemu_coroutine_yield();
                ret = completion.ret;
            } else {
                ret = -EIO;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and therefore don't support bdrv_flush.  Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk.  Returning an
             * error doesn't work because that would break guests even if
             * the server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }
        if (ret < 0) {
            return ret;
        }
    }

    /*
     * Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
4109

    
4110
/* Call the driver's bdrv_invalidate_cache hook, if bs has a driver with one. */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (!bs->drv || !bs->drv->bdrv_invalidate_cache) {
        return;
    }

    bs->drv->bdrv_invalidate_cache(bs);
}
4116

    
4117
void bdrv_invalidate_cache_all(void)
4118
{
4119
    BlockDriverState *bs;
4120

    
4121
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4122
        bdrv_invalidate_cache(bs);
4123
    }
4124
}
4125

    
4126
void bdrv_clear_incoming_migration_all(void)
4127
{
4128
    BlockDriverState *bs;
4129

    
4130
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4131
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4132
    }
4133
}
4134

    
4135
/*
 * Synchronous flush.  Inside a coroutine the flush runs directly; otherwise
 * a coroutine is created and the caller polls qemu_aio_wait() until the
 * result is no longer NOT_DONE.
 */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *worker;
    RwCo state = {
        .ret = NOT_DONE,
        .bs = bs,
    };

    if (!qemu_in_coroutine()) {
        worker = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(worker, &state);
        while (state.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&state);
    }

    return state.ret;
}
4156

    
4157
/* Coroutine entry point for bdrv_discard(): store the result in RwCo. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *state = opaque;

    state->ret = bdrv_co_discard(state->bs, state->sector_num,
                                 state->nb_sectors);
}
4163

    
4164
/*
 * Discard nb_sectors sectors starting at sector_num.
 *
 * Returns -ENOMEDIUM without a driver, -EIO if bdrv_check_request() rejects
 * the range, -EROFS for a read-only bs.  The dirty bitmap (if any) is reset
 * for the range even when discard is disabled (BDRV_O_UNMAP not set) or the
 * driver has no discard hook; both of those cases return 0.
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
    if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    }

    if (bs->drv->bdrv_aio_discard) {
        CoroutineIOCompletion completion = {
            .coroutine = qemu_coroutine_self(),
        };
        BlockDriverAIOCB *acb;

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &completion);
        if (acb == NULL) {
            return -EIO;
        }

        /* Resumed by bdrv_co_io_em_complete() */
        qemu_coroutine_yield();
        return completion.ret;
    }

    return 0;
}
4204

    
4205
/*
 * Synchronous discard.  Inside a coroutine the discard runs directly;
 * otherwise a coroutine is created and the caller polls qemu_aio_wait()
 * until the result is no longer NOT_DONE.
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *worker;
    RwCo state = {
        .ret = NOT_DONE,
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
    };

    if (!qemu_in_coroutine()) {
        worker = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(worker, &state);
        while (state.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&state);
    }

    return state.ret;
}
4228

    
4229
/**************************************************************/
4230
/* removable device support */
4231

    
4232
/**
4233
 * Return TRUE if the media is present
4234
 */
4235
int bdrv_is_inserted(BlockDriverState *bs)
4236
{
4237
    BlockDriver *drv = bs->drv;
4238

    
4239
    if (!drv)
4240
        return 0;
4241
    if (!drv->bdrv_is_inserted)
4242
        return 1;
4243
    return drv->bdrv_is_inserted(bs);
4244
}
4245

    
4246
/**
4247
 * Return whether the media changed since the last call to this
4248
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4249
 */
4250
int bdrv_media_changed(BlockDriverState *bs)
4251
{
4252
    BlockDriver *drv = bs->drv;
4253

    
4254
    if (drv && drv->bdrv_media_changed) {
4255
        return drv->bdrv_media_changed(bs);
4256
    }
4257
    return -ENOTSUP;
4258
}
4259

    
4260
/**
4261
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4262
 */
4263
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4264
{
4265
    BlockDriver *drv = bs->drv;
4266

    
4267
    if (drv && drv->bdrv_eject) {
4268
        drv->bdrv_eject(bs, eject_flag);
4269
    }
4270

    
4271
    if (bs->device_name[0] != '\0') {
4272
        bdrv_emit_qmp_eject_event(bs, eject_flag);
4273
    }
4274
}
4275

    
4276
/**
4277
 * Lock or unlock the media (if it is locked, the user won't be able
4278
 * to eject it manually).
4279
 */
4280
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4281
{
4282
    BlockDriver *drv = bs->drv;
4283

    
4284
    trace_bdrv_lock_medium(bs, locked);
4285

    
4286
    if (drv && drv->bdrv_lock_medium) {
4287
        drv->bdrv_lock_medium(bs, locked);
4288
    }
4289
}
4290

    
4291
/* needed for generic scsi interface */
4292

    
4293
/* Forward an ioctl to the driver; -ENOTSUP without a bdrv_ioctl hook. */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }

    return drv->bdrv_ioctl(bs, req, buf);
}
4301

    
4302
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4303
        unsigned long int req, void *buf,
4304
        BlockDriverCompletionFunc *cb, void *opaque)
4305
{
4306
    BlockDriver *drv = bs->drv;
4307

    
4308
    if (drv && drv->bdrv_aio_ioctl)
4309
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4310
    return NULL;
4311
}
4312

    
4313
/* Record the buffer alignment later used by qemu_blockalign() and friends. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
4317

    
4318
/*
 * Allocate size bytes with qemu_memalign(), aligned to bs->buffer_alignment,
 * or to 512 when bs is NULL or its buffer_alignment is zero.
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    size_t alignment = 512;

    if (bs && bs->buffer_alignment) {
        alignment = bs->buffer_alignment;
    }

    return qemu_memalign(alignment, size);
}
4322

    
4323
/*
4324
 * Check if all memory in this vector is sector aligned.
4325
 */
4326
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4327
{
4328
    int i;
4329

    
4330
    for (i = 0; i < qiov->niov; i++) {
4331
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4332
            return false;
4333
        }
4334
    }
4335

    
4336
    return true;
4337
}
4338

    
4339
/*
 * Enable dirty tracking with the given granularity (in bytes, must be a
 * power of two), or disable it and free the bitmap when granularity is 0.
 * Enabling asserts that no bitmap exists yet.
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t nb_sectors;

    assert((granularity & (granularity - 1)) == 0);

    if (!granularity) {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
        return;
    }

    /* Convert the byte granularity into a sector granularity. */
    granularity >>= BDRV_SECTOR_BITS;
    assert(!bs->dirty_bitmap);
    nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    bs->dirty_bitmap = hbitmap_alloc(nb_sectors, ffs(granularity) - 1);
}
4357

    
4358
/* Return the dirty state of sector, or 0 when there is no dirty bitmap. */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_get(bs->dirty_bitmap, sector);
}
4366

    
4367
/* Initialise hbi to iterate over bs's dirty bitmap starting at position 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    const uint64_t first = 0;

    hbitmap_iter_init(hbi, bs->dirty_bitmap, first);
}
4371

    
4372
/*
 * Mark nr_sectors sectors starting at cur_sector dirty.  The bitmap pointer
 * is passed on unchecked.
 */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    HBitmap *bitmap = bs->dirty_bitmap;

    hbitmap_set(bitmap, cur_sector, nr_sectors);
}
4377

    
4378
/*
 * Mark nr_sectors sectors starting at cur_sector clean.  The bitmap pointer
 * is passed on unchecked.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    HBitmap *bitmap = bs->dirty_bitmap;

    hbitmap_reset(bitmap, cur_sector, nr_sectors);
}
4383

    
4384
/* Return hbitmap_count() of the dirty bitmap, or 0 when there is none. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_count(bs->dirty_bitmap);
}
4392

    
4393
/* Get a reference to bs */
4394
void bdrv_ref(BlockDriverState *bs)
4395
{
4396
    bs->refcnt++;
4397
}
4398

    
4399
/* Release a previously grabbed reference to bs.
4400
 * If after releasing, reference count is zero, the BlockDriverState is
4401
 * deleted. */
4402
void bdrv_unref(BlockDriverState *bs)
4403
{
4404
    assert(bs->refcnt > 0);
4405
    if (--bs->refcnt == 0) {
4406
        bdrv_delete(bs);
4407
    }
4408
}
4409

    
4410
/* Change the in_use flag; asserts that the value actually changes. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(in_use != bs->in_use);

    bs->in_use = in_use;
}
4415

    
4416
/* Return the current value of the in_use flag. */
int bdrv_in_use(BlockDriverState *bs)
{
    int in_use = bs->in_use;

    return in_use;
}
4420

    
4421
/* Turn I/O status tracking on and start from the OK state. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
4426

    
4427
/* The I/O status is only enabled if the drive explicitly
4428
 * enables it _and_ the VM is configured to stop on errors */
4429
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4430
{
4431
    return (bs->iostatus_enabled &&
4432
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4433
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4434
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4435
}
4436

    
4437
/* Turn I/O status tracking off; the stored iostatus value is untouched. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
4441

    
4442
/*
 * Reset the I/O status to OK, and that of an attached block job as well.
 * Does nothing unless bdrv_iostatus_is_enabled().
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    if (bs->job) {
        block_job_iostatus_reset(bs->job);
    }
}
4451

    
4452
/*
 * Record an I/O error: NOSPACE when error equals ENOSPC, FAILED otherwise.
 * An already recorded non-OK status is kept.  Asserts that I/O status
 * tracking is enabled.
 */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));

    if (bs->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
        return;
    }

    if (error == ENOSPC) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
    } else {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
4460

    
4461
void
4462
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4463
        enum BlockAcctType type)
4464
{
4465
    assert(type < BDRV_MAX_IOTYPE);
4466

    
4467
    cookie->bytes = bytes;
4468
    cookie->start_time_ns = get_clock();
4469
    cookie->type = type;
4470
}
4471

    
4472
void
4473
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4474
{
4475
    assert(cookie->type < BDRV_MAX_IOTYPE);
4476

    
4477
    bs->nr_bytes[cookie->type] += cookie->bytes;
4478
    bs->nr_ops[cookie->type]++;
4479
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4480
}
4481

    
4482
void bdrv_img_create(const char *filename, const char *fmt,
4483
                     const char *base_filename, const char *base_fmt,
4484
                     char *options, uint64_t img_size, int flags,
4485
                     Error **errp, bool quiet)
4486
{
4487
    QEMUOptionParameter *param = NULL, *create_options = NULL;
4488
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
4489
    BlockDriverState *bs = NULL;
4490
    BlockDriver *drv, *proto_drv;
4491
    BlockDriver *backing_drv = NULL;
4492
    Error *local_err = NULL;
4493
    int ret = 0;
4494

    
4495
    /* Find driver and parse its options */
4496
    drv = bdrv_find_format(fmt);
4497
    if (!drv) {
4498
        error_setg(errp, "Unknown file format '%s'", fmt);
4499
        return;
4500
    }
4501

    
4502
    proto_drv = bdrv_find_protocol(filename, true);
4503
    if (!proto_drv) {
4504
        error_setg(errp, "Unknown protocol '%s'", filename);
4505
        return;
4506
    }
4507

    
4508
    create_options = append_option_parameters(create_options,
4509
                                              drv->create_options);
4510
    create_options = append_option_parameters(create_options,
4511
                                              proto_drv->create_options);
4512

    
4513
    /* Create parameter list with default values */
4514
    param = parse_option_parameters("", create_options, param);
4515

    
4516
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4517

    
4518
    /* Parse -o options */
4519
    if (options) {
4520
        param = parse_option_parameters(options, create_options, param);
4521
        if (param == NULL) {
4522
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
4523
            goto out;
4524
        }
4525
    }
4526

    
4527
    if (base_filename) {
4528
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4529
                                 base_filename)) {
4530
            error_setg(errp, "Backing file not supported for file format '%s'",
4531
                       fmt);
4532
            goto out;
4533
        }
4534
    }
4535

    
4536
    if (base_fmt) {
4537
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4538
            error_setg(errp, "Backing file format not supported for file "
4539
                             "format '%s'", fmt);
4540
            goto out;
4541
        }
4542
    }
4543

    
4544
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4545
    if (backing_file && backing_file->value.s) {
4546
        if (!strcmp(filename, backing_file->value.s)) {
4547
            error_setg(errp, "Error: Trying to create an image with the "
4548
                             "same filename as the backing file");
4549
            goto out;
4550
        }
4551
    }
4552

    
4553
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4554
    if (backing_fmt && backing_fmt->value.s) {
4555
        backing_drv = bdrv_find_format(backing_fmt->value.s);
4556
        if (!backing_drv) {
4557
            error_setg(errp, "Unknown backing file format '%s'",
4558
                       backing_fmt->value.s);
4559
            goto out;
4560
        }
4561
    }
4562

    
4563
    // The size for the image must always be specified, with one exception:
4564
    // If we are using a backing file, we can obtain the size from there
4565
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
4566
    if (size && size->value.n == -1) {
4567
        if (backing_file && backing_file->value.s) {
4568
            uint64_t size;
4569
            char buf[32];
4570
            int back_flags;
4571

    
4572
            /* backing files always opened read-only */
4573
            back_flags =
4574
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4575

    
4576
            bs = bdrv_new("");
4577

    
4578
            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
4579
                            backing_drv, &local_err);
4580
            if (ret < 0) {
4581
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
4582
                                 backing_file->value.s,
4583
                                 error_get_pretty(local_err));
4584
                error_free(local_err);
4585
                local_err = NULL;
4586
                goto out;
4587
            }
4588
            bdrv_get_geometry(bs, &size);
4589
            size *= 512;
4590

    
4591
            snprintf(buf, sizeof(buf), "%" PRId64, size);
4592
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4593
        } else {
4594
            error_setg(errp, "Image creation needs a size parameter");
4595
            goto out;
4596
        }
4597
    }
4598

    
4599
    if (!quiet) {
4600
        printf("Formatting '%s', fmt=%s ", filename, fmt);
4601
        print_option_parameters(param);
4602
        puts("");
4603
    }
4604
    ret = bdrv_create(drv, filename, param, &local_err);
4605
    if (ret == -EFBIG) {
4606
        /* This is generally a better message than whatever the driver would
4607
         * deliver (especially because of the cluster_size_hint), since that
4608
         * is most probably not much different from "image too large". */
4609
        const char *cluster_size_hint = "";
4610
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
4611
            cluster_size_hint = " (try using a larger cluster size)";
4612
        }
4613
        error_setg(errp, "The image size is too large for file format '%s'"
4614
                   "%s", fmt, cluster_size_hint);
4615
        error_free(local_err);
4616
        local_err = NULL;
4617
    }
4618

    
4619
out:
4620
    free_option_parameters(create_options);
4621
    free_option_parameters(param);
4622

    
4623
    if (bs) {
4624
        bdrv_unref(bs);
4625
    }
4626
    if (error_is_set(&local_err)) {
4627
        error_propagate(errp, local_err);
4628
    }
4629
}
4630

    
4631
/*
 * Return the AioContext of bs.  Currently BlockDriverState always uses the
 * main loop AioContext, so bs is not consulted.
 */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    AioContext *main_ctx = qemu_get_aio_context();

    return main_ctx;
}
4636

    
4637
void bdrv_add_before_write_notifier(BlockDriverState *bs,
4638
                                    NotifierWithReturn *notifier)
4639
{
4640
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
4641
}
4642

    
4643
/*
 * Ask the driver to amend the image options.
 *
 * Returns -ENOMEDIUM if bs has no driver, -ENOTSUP if the driver has no
 * bdrv_amend_options hook, otherwise the hook's return value.
 */
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    BlockDriver *drv = bs->drv;

    /* Guard against a bs without medium instead of dereferencing NULL. */
    if (drv == NULL) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return drv->bdrv_amend_options(bs, options);
}