/* QEMU block.c @ revision d616b224 — repository-browser export header removed. */

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "qmp-commands.h"
36
#include "qemu/timer.h"
37

    
38
#ifdef CONFIG_BSD
39
#include <sys/types.h>
40
#include <sys/stat.h>
41
#include <sys/ioctl.h>
42
#include <sys/queue.h>
43
#ifndef __DragonFly__
44
#include <sys/disk.h>
45
#endif
46
#endif
47

    
48
#ifdef _WIN32
49
#include <windows.h>
50
#endif
51

    
52
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Internal request flags ORed into the flags argument of the common
 * read/write paths below; not visible outside this file. */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

/* Forward declarations for the emulation helpers and common I/O paths
 * defined later in this file. */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

/* I/O throttling helpers (implementations appear later in the file). */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

/* All named BlockDriverStates, in creation order (populated by bdrv_new()). */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* All registered block drivers (populated by bdrv_register()). */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
104

    
105
#ifdef _WIN32
106
static int is_windows_drive_prefix(const char *filename)
107
{
108
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
110
            filename[1] == ':');
111
}
112

    
113
int is_windows_drive(const char *filename)
114
{
115
    if (is_windows_drive_prefix(filename) &&
116
        filename[2] == '\0')
117
        return 1;
118
    if (strstart(filename, "\\\\.\\", NULL) ||
119
        strstart(filename, "//./", NULL))
120
        return 1;
121
    return 0;
122
}
123
#endif
124

    
125
/* throttling disk I/O limits */

/* Turn off I/O throttling for bs: release every queued request, tear
 * down the wakeup timer, and reset the accounting slice. */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    /* Drain the whole queue of throttled requests; with the enabled flag
     * already cleared they will not re-queue themselves. */
    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
}
141

    
142
/* Timer callback armed by bdrv_io_limits_intercept(): wake the first
 * coroutine waiting in the throttle queue once its wait time elapsed. */
static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}
148

    
149
/* Turn on I/O throttling for bs: initialize the queue of throttled
 * requests and create the wakeup timer on the vm_clock. */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->io_limits_enabled = true;
}
155

    
156
/* Return true if any bandwidth or iops limit is configured for bs. */
bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *lim = &bs->io_limits;

    if (lim->bps[BLOCK_IO_LIMIT_READ] ||
        lim->bps[BLOCK_IO_LIMIT_WRITE] ||
        lim->bps[BLOCK_IO_LIMIT_TOTAL]) {
        return true;
    }
    return lim->iops[BLOCK_IO_LIMIT_READ] ||
           lim->iops[BLOCK_IO_LIMIT_WRITE] ||
           lim->iops[BLOCK_IO_LIMIT_TOTAL];
}
166

    
167
/* Called on the common I/O path before issuing a throttled request.
 * Blocks the calling coroutine until the request fits within the
 * configured bandwidth/iops limits, preserving FIFO order. */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* Keep FIFO order: first wait behind requests that are already queued. */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted to the head. All requests followed it will
     * be still in throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        /* Re-arm the timer for when enough budget is available, then sleep
         * at the head of the queue so requests behind us stay behind us. */
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* Wake the next queued request; it re-checks the limits for itself. */
    qemu_co_queue_next(&bs->throttled_reqs);
}
191

    
192
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    /* Drive letters ("c:", "c:\foo") and device paths are not protocols. */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    /* First character that could terminate a protocol prefix. */
    const char *sep = path + strcspn(path, ":/\\");
#else
    const char *sep = path + strcspn(path, ":/");
#endif

    /* A protocol is present only if a ':' occurs before any separator. */
    return *sep == ':';
}
209

    
210
/* Return 1 if path is absolute (or, on Windows, a drive or device path),
 * 0 otherwise. */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return *path == '/' || *path == '\\';
#else
    return *path == '/';
#endif
}
222

    
223
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* p: start of the path component of base_path — skip a
         * "protocol:" prefix if one is present. */
        p = strchr(base_path, ':');
        if (p) {
            p++;
        } else {
            p = base_path;
        }
        /* p1: one past the last directory separator in base_path. */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            /* Also honor backslash separators on Windows. */
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1) {
                p1 = p2;
            }
        }
#endif
        if (p1) {
            p1++;
        } else {
            p1 = base_path;
        }
        if (p1 > p) {
            p = p1;
        }
        /* Copy the (possibly truncated) directory prefix, then append
         * the relative filename, always NUL-terminating. */
        len = p - base_path;
        if (len > dest_size - 1) {
            len = dest_size - 1;
        }
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
266

    
267
/* Write into dest the filename to open bs's backing file: a relative
 * backing file name is resolved against bs->filename, while an empty
 * name or a protocol URL is copied verbatim. */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;

    if (backing[0] != '\0' && !path_has_protocol(backing)) {
        /* Relative path: interpret it against the image's own location. */
        path_combine(dest, sz, bs->filename, backing);
    } else {
        pstrcpy(dest, sz, backing);
    }
}
275

    
276
/* Register a block driver, filling in emulated coroutine/AIO callbacks
 * where the driver provides only one style, and add it to the global
 * bdrv_drivers list. */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
295

    
296
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    /* Only named devices go into the global bdrv_states list; anonymous
     * ("") states stay private to their creator. */
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);

    return bs;
}
312

    
313
/* Register notify on bs->close_notifiers (the list initialized in
 * bdrv_new(); presumably fired when the device is closed — the list is
 * consumed elsewhere in this file). */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
317

    
318
/* Look up a registered block driver by its format name ("raw", "qcow2",
 * ...). Returns NULL if no driver with that name was registered. */
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}
328

    
329
/* Return 1 if drv may be used for the requested access mode according to
 * the build-time whitelists (CONFIG_BDRV_RW_WHITELIST / _RO_WHITELIST).
 * Read-only-whitelisted drivers count only when read_only is requested.
 * Empty whitelists allow every driver. */
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    /* RW-whitelisted drivers are allowed for any access mode. */
    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}
357

    
358
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
359
                                          bool read_only)
360
{
361
    BlockDriver *drv = bdrv_find_format(format_name);
362
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
363
}
364

    
365
/* Arguments and result shared between bdrv_create() and its coroutine
 * entry point below. */
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;               /* owned copy, freed by bdrv_create() */
    QEMUOptionParameter *options;
    int ret;                      /* NOT_DONE until the coroutine finishes */
} CreateCo;

/* Coroutine entry point: run the driver's image-creation callback and
 * publish its result in cco->ret. */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}
379

    
380
/* Create an image file using driver drv. The driver callback runs in
 * coroutine context; if we are not already in a coroutine, one is spawned
 * and the AIO loop is pumped until it finishes. Returns 0 on success or
 * a negative errno (-ENOTSUP if drv cannot create images). */
int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        /* Pump pending AIO until the coroutine stores its result. */
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;

out:
    /* The filename copy is owned here regardless of which path ran. */
    g_free(cco.filename);
    return ret;
}
415

    
416
/* Create an image at the protocol level: pick the protocol driver from
 * the filename and delegate to bdrv_create(). Returns -ENOENT when no
 * protocol driver matches. */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv = bdrv_find_protocol(filename);

    return drv ? bdrv_create(drv, filename, options) : -ENOENT;
}
427

    
428
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    /* Honor $TMPDIR, falling back to /tmp. */
    const char *tmpdir = getenv("TMPDIR");
    if (tmpdir == NULL) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    /* mkstemp() both picks a unique name and creates the file. */
    int fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
462

    
463
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    /* Ask every driver that can probe host devices and keep the
     * highest-scoring match; NULL if nobody recognizes the file. */
    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
484

    
485
/* Pick the protocol-level driver for filename: host-device probing wins,
 * then a "<protocol>:" prefix is matched against registered drivers'
 * protocol_name, and a plain path falls back to the "file" driver.
 * Returns NULL if a protocol prefix exists but no driver matches it. */
BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    /* path_has_protocol() guarantees a ':' before any separator. */
    p = strchr(filename, ':');
    assert(p != NULL);
    /* Copy the protocol prefix (truncated to 127 chars) for comparison. */
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
524

    
525
/* Probe the already-opened protocol layer bs to choose a format driver,
 * storing it in *pdrv. Returns 0 on success, negative errno on failure
 * (-ENOENT when no driver recognizes the image). */
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    /* Read the image header and let every probing driver score it. */
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            /* ret is the number of header bytes actually read. */
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
566

    
567
/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            /* Negative length is the driver's error code. */
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}
590

    
591
/**
592
 * Set open flags for a given discard mode
593
 *
594
 * Return 0 on success, -1 if the discard mode was invalid.
595
 */
596
int bdrv_parse_discard_flags(const char *mode, int *flags)
597
{
598
    *flags &= ~BDRV_O_UNMAP;
599

    
600
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
601
        /* do nothing */
602
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
603
        *flags |= BDRV_O_UNMAP;
604
    } else {
605
        return -1;
606
    }
607

    
608
    return 0;
609
}
610

    
611
/**
612
 * Set open flags for a given cache mode
613
 *
614
 * Return 0 on success, -1 if the cache mode was invalid.
615
 */
616
int bdrv_parse_cache_flags(const char *mode, int *flags)
617
{
618
    *flags &= ~BDRV_O_CACHE_MASK;
619

    
620
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
621
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
622
    } else if (!strcmp(mode, "directsync")) {
623
        *flags |= BDRV_O_NOCACHE;
624
    } else if (!strcmp(mode, "writeback")) {
625
        *flags |= BDRV_O_CACHE_WB;
626
    } else if (!strcmp(mode, "unsafe")) {
627
        *flags |= BDRV_O_CACHE_WB;
628
        *flags |= BDRV_O_NO_FLUSH;
629
    } else if (!strcmp(mode, "writethrough")) {
630
        /* this is the default */
631
    } else {
632
        return -1;
633
    }
634

    
635
    return 0;
636
}
637

    
638
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    /* Must balance a prior bdrv_enable_copy_on_read() call. */
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
653

    
654
/* Compute the flags actually handed to the driver's open callback from
 * the caller-supplied flags. */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
673

    
674
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv)
{
    int ret, open_flags;
    const char *filename;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        /* Protocol driver: no lower layer; needs a filename unless the
         * driver can parse one out of the options itself. */
        assert(file == NULL);
        assert(drv->bdrv_parse_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags);
    } else {
        if (file == NULL) {
            qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a "
                          "block driver for the protocol level",
                          drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        assert(file != NULL);
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags);
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* Temporary files are unlinked right away so they vanish on exit;
     * the open file descriptor keeps them usable. */
    if (bs->is_temporary) {
        assert(filename != NULL);
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    /* Undo the partial setup; the caller still owns 'file'. */
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
772

    
773
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* bs->options keeps the caller's reference; work on a shallow copy so
     * processed options can be deleted from it as they are consumed. */
    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
    } else {
        /* Both a filename argument and a "filename" option: ambiguous. */
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and "
                      "'filename' options at the same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename);
    } else {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        /* The driver decomposes the filename string into options. */
        Error *local_err = NULL;
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            qerror_report_err(local_err);
            error_free(local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (!drv->bdrv_parse_filename && !filename) {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "The '%s' block driver requires a file name",
                      drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, options, flags, drv);
    if (ret < 0) {
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
                      "support the option '%s'",
                      drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    /* Drop the working copy; if the driver never opened, also drop the
     * caller's reference stored in bs->options (bdrv_delete would only
     * release it for an opened state). */
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_delete(bs);
    return ret;
}
875

    
876
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;

    /* Already open: nothing to do (the options reference is consumed). */
    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        /* Filename comes from the options; leave the buffer empty. */
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No backing file configured at all. */
        QDECREF(options);
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv);
    if (ret < 0) {
        /* Roll back and remember that opening the backing file failed. */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        return ret;
    }
    return 0;
}
930

    
931
/* Move every entry of src whose key begins with prefix 'start' into a
 * freshly allocated dict *dst, stripping the prefix from the key. */
static void extract_subqdict(QDict *src, QDict **dst, const char *start)
{
    const QDictEntry *entry, *next;
    const char *p;

    *dst = qdict_new();
    entry = qdict_first(src);

    while (entry != NULL) {
        /* Fetch the successor before possibly deleting the entry. */
        next = qdict_next(src, entry);
        if (strstart(entry->key, start, &p)) {
            /* p points past the prefix; keep the value alive across the
             * deletion from src. */
            qobject_incref(entry->value);
            qdict_put_obj(*dst, p, entry->value);
            qdict_del(src, entry->key);
        }
        entry = next;
    }
}
949

    
950
/*
951
 * Opens a disk image (raw, qcow2, vmdk, ...)
952
 *
953
 * options is a QDict of options to pass to the block drivers, or NULL for an
954
 * empty set of options. The reference to the QDict belongs to the block layer
955
 * after the call (even on failure), so if the caller intends to reuse the
956
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
957
 */
958
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
959
              int flags, BlockDriver *drv)
960
{
961
    int ret;
962
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
963
    char tmp_filename[PATH_MAX + 1];
964
    BlockDriverState *file = NULL;
965
    QDict *file_options = NULL;
966

    
967
    /* NULL means an empty set of options */
968
    if (options == NULL) {
969
        options = qdict_new();
970
    }
971

    
972
    bs->options = options;
973
    options = qdict_clone_shallow(options);
974

    
975
    /* For snapshot=on, create a temporary qcow2 overlay */
976
    if (flags & BDRV_O_SNAPSHOT) {
977
        BlockDriverState *bs1;
978
        int64_t total_size;
979
        BlockDriver *bdrv_qcow2;
980
        QEMUOptionParameter *create_options;
981
        char backing_filename[PATH_MAX];
982

    
983
        if (qdict_size(options) != 0) {
984
            error_report("Can't use snapshot=on with driver-specific options");
985
            ret = -EINVAL;
986
            goto fail;
987
        }
988
        assert(filename != NULL);
989

    
990
        /* if snapshot, we create a temporary backing file and open it
991
           instead of opening 'filename' directly */
992

    
993
        /* if there is a backing file, use it */
994
        bs1 = bdrv_new("");
995
        ret = bdrv_open(bs1, filename, NULL, 0, drv);
996
        if (ret < 0) {
997
            bdrv_delete(bs1);
998
            goto fail;
999
        }
1000
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1001

    
1002
        bdrv_delete(bs1);
1003

    
1004
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1005
        if (ret < 0) {
1006
            goto fail;
1007
        }
1008

    
1009
        /* Real path is meaningless for protocols */
1010
        if (path_has_protocol(filename)) {
1011
            snprintf(backing_filename, sizeof(backing_filename),
1012
                     "%s", filename);
1013
        } else if (!realpath(filename, backing_filename)) {
1014
            ret = -errno;
1015
            goto fail;
1016
        }
1017

    
1018
        bdrv_qcow2 = bdrv_find_format("qcow2");
1019
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1020
                                                 NULL);
1021

    
1022
        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1023
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
1024
                             backing_filename);
1025
        if (drv) {
1026
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
1027
                drv->format_name);
1028
        }
1029

    
1030
        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
1031
        free_option_parameters(create_options);
1032
        if (ret < 0) {
1033
            goto fail;
1034
        }
1035

    
1036
        filename = tmp_filename;
1037
        drv = bdrv_qcow2;
1038
        bs->is_temporary = 1;
1039
    }
1040

    
1041
    /* Open image file without format layer */
1042
    if (flags & BDRV_O_RDWR) {
1043
        flags |= BDRV_O_ALLOW_RDWR;
1044
    }
1045

    
1046
    extract_subqdict(options, &file_options, "file.");
1047

    
1048
    ret = bdrv_file_open(&file, filename, file_options,
1049
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP));
1050
    if (ret < 0) {
1051
        goto fail;
1052
    }
1053

    
1054
    /* Find the right image format driver */
1055
    if (!drv) {
1056
        ret = find_image_format(file, filename, &drv);
1057
    }
1058

    
1059
    if (!drv) {
1060
        goto unlink_and_fail;
1061
    }
1062

    
1063
    /* Open the image */
1064
    ret = bdrv_open_common(bs, file, options, flags, drv);
1065
    if (ret < 0) {
1066
        goto unlink_and_fail;
1067
    }
1068

    
1069
    if (bs->file != file) {
1070
        bdrv_delete(file);
1071
        file = NULL;
1072
    }
1073

    
1074
    /* If there is a backing file, use it */
1075
    if ((flags & BDRV_O_NO_BACKING) == 0) {
1076
        QDict *backing_options;
1077

    
1078
        extract_subqdict(options, &backing_options, "backing.");
1079
        ret = bdrv_open_backing_file(bs, backing_options);
1080
        if (ret < 0) {
1081
            goto close_and_fail;
1082
        }
1083
    }
1084

    
1085
    /* Check if any unknown options were used */
1086
    if (qdict_size(options) != 0) {
1087
        const QDictEntry *entry = qdict_first(options);
1088
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
1089
            "device '%s' doesn't support the option '%s'",
1090
            drv->format_name, bs->device_name, entry->key);
1091

    
1092
        ret = -EINVAL;
1093
        goto close_and_fail;
1094
    }
1095
    QDECREF(options);
1096

    
1097
    if (!bdrv_key_required(bs)) {
1098
        bdrv_dev_change_media_cb(bs, true);
1099
    }
1100

    
1101
    /* throttling disk I/O limits */
1102
    if (bs->io_limits_enabled) {
1103
        bdrv_io_limits_enable(bs);
1104
    }
1105

    
1106
    return 0;
1107

    
1108
unlink_and_fail:
1109
    if (file != NULL) {
1110
        bdrv_delete(file);
1111
    }
1112
    if (bs->is_temporary) {
1113
        unlink(filename);
1114
    }
1115
fail:
1116
    QDECREF(bs->options);
1117
    QDECREF(options);
1118
    bs->options = NULL;
1119
    return ret;
1120

    
1121
close_and_fail:
1122
    bdrv_close(bs);
1123
    QDECREF(options);
1124
    return ret;
1125
}
1126

    
1127
typedef struct BlockReopenQueueEntry {
1128
     bool prepared;
1129
     BDRVReopenState state;
1130
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1131
} BlockReopenQueueEntry;
1132

    
1133
/*
1134
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1135
 * reopen of multiple devices.
1136
 *
1137
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1138
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1139
 * be created and initialized. This newly created BlockReopenQueue should be
1140
 * passed back in for subsequent calls that are intended to be of the same
1141
 * atomic 'set'.
1142
 *
1143
 * bs is the BlockDriverState to add to the reopen queue.
1144
 *
1145
 * flags contains the open flags for the associated bs
1146
 *
1147
 * returns a pointer to bs_queue, which is either the newly allocated
1148
 * bs_queue, or the existing bs_queue being used.
1149
 *
1150
 */
1151
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1152
                                    BlockDriverState *bs, int flags)
1153
{
1154
    assert(bs != NULL);
1155

    
1156
    BlockReopenQueueEntry *bs_entry;
1157
    if (bs_queue == NULL) {
1158
        bs_queue = g_new0(BlockReopenQueue, 1);
1159
        QSIMPLEQ_INIT(bs_queue);
1160
    }
1161

    
1162
    if (bs->file) {
1163
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1164
    }
1165

    
1166
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1167
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1168

    
1169
    bs_entry->state.bs = bs;
1170
    bs_entry->state.flags = flags;
1171

    
1172
    return bs_queue;
1173
}
1174

    
1175
/*
1176
 * Reopen multiple BlockDriverStates atomically & transactionally.
1177
 *
1178
 * The queue passed in (bs_queue) must have been built up previous
1179
 * via bdrv_reopen_queue().
1180
 *
1181
 * Reopens all BDS specified in the queue, with the appropriate
1182
 * flags.  All devices are prepared for reopen, and failure of any
1183
 * device will cause all device changes to be abandonded, and intermediate
1184
 * data cleaned up.
1185
 *
1186
 * If all devices prepare successfully, then the changes are committed
1187
 * to all devices.
1188
 *
1189
 */
1190
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1191
{
1192
    int ret = -1;
1193
    BlockReopenQueueEntry *bs_entry, *next;
1194
    Error *local_err = NULL;
1195

    
1196
    assert(bs_queue != NULL);
1197

    
1198
    bdrv_drain_all();
1199

    
1200
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1201
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1202
            error_propagate(errp, local_err);
1203
            goto cleanup;
1204
        }
1205
        bs_entry->prepared = true;
1206
    }
1207

    
1208
    /* If we reach this point, we have success and just need to apply the
1209
     * changes
1210
     */
1211
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1212
        bdrv_reopen_commit(&bs_entry->state);
1213
    }
1214

    
1215
    ret = 0;
1216

    
1217
cleanup:
1218
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1219
        if (ret && bs_entry->prepared) {
1220
            bdrv_reopen_abort(&bs_entry->state);
1221
        }
1222
        g_free(bs_entry);
1223
    }
1224
    g_free(bs_queue);
1225
    return ret;
1226
}
1227

    
1228

    
1229
/* Reopen a single BlockDriverState with the specified flags. */
1230
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1231
{
1232
    int ret = -1;
1233
    Error *local_err = NULL;
1234
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1235

    
1236
    ret = bdrv_reopen_multiple(queue, &local_err);
1237
    if (local_err != NULL) {
1238
        error_propagate(errp, local_err);
1239
    }
1240
    return ret;
1241
}
1242

    
1243

    
1244
/*
1245
 * Prepares a BlockDriverState for reopen. All changes are staged in the
1246
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1247
 * the block driver layer .bdrv_reopen_prepare()
1248
 *
1249
 * bs is the BlockDriverState to reopen
1250
 * flags are the new open flags
1251
 * queue is the reopen queue
1252
 *
1253
 * Returns 0 on success, non-zero on error.  On error errp will be set
1254
 * as well.
1255
 *
1256
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1257
 * It is the responsibility of the caller to then call the abort() or
1258
 * commit() for any other BDS that have been left in a prepare() state
1259
 *
1260
 */
1261
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1262
                        Error **errp)
1263
{
1264
    int ret = -1;
1265
    Error *local_err = NULL;
1266
    BlockDriver *drv;
1267

    
1268
    assert(reopen_state != NULL);
1269
    assert(reopen_state->bs->drv != NULL);
1270
    drv = reopen_state->bs->drv;
1271

    
1272
    /* if we are to stay read-only, do not allow permission change
1273
     * to r/w */
1274
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1275
        reopen_state->flags & BDRV_O_RDWR) {
1276
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1277
                  reopen_state->bs->device_name);
1278
        goto error;
1279
    }
1280

    
1281

    
1282
    ret = bdrv_flush(reopen_state->bs);
1283
    if (ret) {
1284
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1285
                  strerror(-ret));
1286
        goto error;
1287
    }
1288

    
1289
    if (drv->bdrv_reopen_prepare) {
1290
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1291
        if (ret) {
1292
            if (local_err != NULL) {
1293
                error_propagate(errp, local_err);
1294
            } else {
1295
                error_setg(errp, "failed while preparing to reopen image '%s'",
1296
                           reopen_state->bs->filename);
1297
            }
1298
            goto error;
1299
        }
1300
    } else {
1301
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1302
         * handler for each supported drv. */
1303
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1304
                  drv->format_name, reopen_state->bs->device_name,
1305
                 "reopening of file");
1306
        ret = -1;
1307
        goto error;
1308
    }
1309

    
1310
    ret = 0;
1311

    
1312
error:
1313
    return ret;
1314
}
1315

    
1316
/*
1317
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1318
 * makes them final by swapping the staging BlockDriverState contents into
1319
 * the active BlockDriverState contents.
1320
 */
1321
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1322
{
1323
    BlockDriver *drv;
1324

    
1325
    assert(reopen_state != NULL);
1326
    drv = reopen_state->bs->drv;
1327
    assert(drv != NULL);
1328

    
1329
    /* If there are any driver level actions to take */
1330
    if (drv->bdrv_reopen_commit) {
1331
        drv->bdrv_reopen_commit(reopen_state);
1332
    }
1333

    
1334
    /* set BDS specific flags now */
1335
    reopen_state->bs->open_flags         = reopen_state->flags;
1336
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1337
                                              BDRV_O_CACHE_WB);
1338
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1339
}
1340

    
1341
/*
1342
 * Abort the reopen, and delete and free the staged changes in
1343
 * reopen_state
1344
 */
1345
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1346
{
1347
    BlockDriver *drv;
1348

    
1349
    assert(reopen_state != NULL);
1350
    drv = reopen_state->bs->drv;
1351
    assert(drv != NULL);
1352

    
1353
    if (drv->bdrv_reopen_abort) {
1354
        drv->bdrv_reopen_abort(reopen_state);
1355
    }
1356
}
1357

    
1358

    
1359
/* Tear down an open image: flush, cancel any job, quiesce I/O, notify
 * close listeners, then release driver state and the underlying file. */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* On POSIX the temporary overlay is unlinked right after open;
         * Windows cannot unlink an open file, so do it here instead. */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        /* Reset all per-image state so the BDS can be reused. */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1406

    
1407
void bdrv_close_all(void)
1408
{
1409
    BlockDriverState *bs;
1410

    
1411
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1412
        bdrv_close(bs);
1413
    }
1414
}
1415

    
1416
/*
1417
 * Wait for pending requests to complete across all BlockDriverStates
1418
 *
1419
 * This function does not flush data to disk, use bdrv_flush_all() for that
1420
 * after calling this function.
1421
 *
1422
 * Note that completion of an asynchronous I/O operation can trigger any
1423
 * number of other I/O operations on other devices---for example a coroutine
1424
 * can be arbitrarily complex and a constant flow of I/O can come until the
1425
 * coroutine is complete.  Because of this, it is not possible to have a
1426
 * function to drain a single device's I/O queue.
1427
 */
1428
void bdrv_drain_all(void)
1429
{
1430
    BlockDriverState *bs;
1431
    bool busy;
1432

    
1433
    do {
1434
        busy = qemu_aio_wait();
1435

    
1436
        /* FIXME: We do not have timer support here, so this is effectively
1437
         * a busy wait.
1438
         */
1439
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
1440
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
1441
                qemu_co_queue_restart_all(&bs->throttled_reqs);
1442
                busy = true;
1443
            }
1444
        }
1445
    } while (busy);
1446

    
1447
    /* If requests are still pending there is a bug somewhere */
1448
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1449
        assert(QLIST_EMPTY(&bs->tracked_requests));
1450
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
1451
    }
1452
}
1453

    
1454
/* make a BlockDriverState anonymous by removing from bdrv_state list.
1455
   Also, NULL terminate the device_name to prevent double remove */
1456
void bdrv_make_anon(BlockDriverState *bs)
1457
{
1458
    if (bs->device_name[0] != '\0') {
1459
        QTAILQ_REMOVE(&bdrv_states, bs, list);
1460
    }
1461
    bs->device_name[0] = '\0';
1462
}
1463

    
1464
/* Give the driver a chance to re-attach any state it keys off the BDS
 * pointer after a bdrv_swap(). */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
1470

    
1471
/* Copy from @bs_src to @bs_dest the fields that must stay attached to the
 * guest device rather than follow the image contents in bdrv_swap(). */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    bs_dest->open_flags         = bs_src->open_flags;

    /* attached device state */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* I/O throttling state */
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->slice_submitted    = bs_src->slice_submitted;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* error handling policy */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* I/O status reporting */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty tracking */
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* block job ownership */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}
1515

    
1516
/*
1517
 * Swap bs contents for two image chains while they are live,
1518
 * while keeping required fields on the BlockDriverState that is
1519
 * actually attached to a device.
1520
 *
1521
 * This will modify the BlockDriverState fields, and swap contents
1522
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1523
 *
1524
 * bs_new is required to be anonymous.
1525
 *
1526
 * This function does not create any image files.
1527
 */
1528
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1529
{
1530
    BlockDriverState tmp;
1531

    
1532
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1533
    assert(bs_new->device_name[0] == '\0');
1534
    assert(bs_new->dirty_bitmap == NULL);
1535
    assert(bs_new->job == NULL);
1536
    assert(bs_new->dev == NULL);
1537
    assert(bs_new->in_use == 0);
1538
    assert(bs_new->io_limits_enabled == false);
1539
    assert(bs_new->block_timer == NULL);
1540

    
1541
    tmp = *bs_new;
1542
    *bs_new = *bs_old;
1543
    *bs_old = tmp;
1544

    
1545
    /* there are some fields that should not be swapped, move them back */
1546
    bdrv_move_feature_fields(&tmp, bs_old);
1547
    bdrv_move_feature_fields(bs_old, bs_new);
1548
    bdrv_move_feature_fields(bs_new, &tmp);
1549

    
1550
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1551
    assert(bs_new->device_name[0] == '\0');
1552

    
1553
    /* Check a few fields that should remain attached to the device */
1554
    assert(bs_new->dev == NULL);
1555
    assert(bs_new->job == NULL);
1556
    assert(bs_new->in_use == 0);
1557
    assert(bs_new->io_limits_enabled == false);
1558
    assert(bs_new->block_timer == NULL);
1559

    
1560
    bdrv_rebind(bs_new);
1561
    bdrv_rebind(bs_old);
1562
}
1563

    
1564
/*
1565
 * Add new bs contents at the top of an image chain while the chain is
1566
 * live, while keeping required fields on the top layer.
1567
 *
1568
 * This will modify the BlockDriverState fields, and swap contents
1569
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1570
 *
1571
 * bs_new is required to be anonymous.
1572
 *
1573
 * This function does not create any image files.
1574
 */
1575
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1576
{
1577
    bdrv_swap(bs_new, bs_top);
1578

    
1579
    /* The contents of 'tmp' will become bs_top, as we are
1580
     * swapping bs_new and bs_top contents. */
1581
    bs_top->backing_hd = bs_new;
1582
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1583
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1584
            bs_new->filename);
1585
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1586
            bs_new->drv ? bs_new->drv->format_name : "");
1587
}
1588

    
1589
/* Destroy a BlockDriverState: it must not be attached to a device, owned
 * by a job, or otherwise in use. */
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    g_free(bs);
}
1602

    
1603
/* Attach a guest device to @bs.  Fails with -EBUSY if one is already
 * attached. */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}
1613

    
1614
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1615
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1616
{
1617
    if (bdrv_attach_dev(bs, dev) < 0) {
1618
        abort();
1619
    }
1620
}
1621

    
1622
/* Detach the guest device @dev from @bs and reset device callbacks. */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    /* restore the default alignment for raw sector access */
    bs->buffer_alignment = 512;
}
1631

    
1632
/* TODO change to return DeviceState * when all users are qdevified */
1633
void *bdrv_get_attached_dev(BlockDriverState *bs)
1634
{
1635
    return bs->dev;
1636
}
1637

    
1638
/* Register the device callback table and its opaque for @bs. */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}
1644

    
1645
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1646
                               enum MonitorEvent ev,
1647
                               BlockErrorAction action, bool is_read)
1648
{
1649
    QObject *data;
1650
    const char *action_str;
1651

    
1652
    switch (action) {
1653
    case BDRV_ACTION_REPORT:
1654
        action_str = "report";
1655
        break;
1656
    case BDRV_ACTION_IGNORE:
1657
        action_str = "ignore";
1658
        break;
1659
    case BDRV_ACTION_STOP:
1660
        action_str = "stop";
1661
        break;
1662
    default:
1663
        abort();
1664
    }
1665

    
1666
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1667
                              bdrv->device_name,
1668
                              action_str,
1669
                              is_read ? "read" : "write");
1670
    monitor_protocol_event(ev, data);
1671

    
1672
    qobject_decref(data);
1673
}
1674

    
1675
/* Emit a QMP tray-moved event for @bs; @ejected tells whether the tray
 * is now open. */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *event_data;

    event_data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                    bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event_data);

    qobject_decref(event_data);
}
1685

    
1686
/* Notify the attached device of a medium change and emit the matching
 * tray-open/tray-close QMP events around it. */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        /* Sample tray state before the callback may change it. */
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}
1701

    
1702
/* A BDS counts as removable when no device is attached, or when the
 * attached device handles medium-change notifications. */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}
1706

    
1707
/* Forward an eject request to the attached device, if it supports one. */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}
1713

    
1714
/* Query the attached device for tray state; defaults to closed when the
 * device does not implement the callback. */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}
1721

    
1722
/* Notify the attached device that the image was resized, if it cares. */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}
1728

    
1729
/* Query the attached device for medium lock state; defaults to unlocked
 * when the device does not implement the callback. */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}
1736

    
1737
/*
1738
 * Run consistency checks on an image
1739
 *
1740
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1741
 * free of errors) or -errno when an internal error occurred. The results of the
1742
 * check are stored in res.
1743
 */
1744
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1745
{
1746
    if (bs->drv->bdrv_check == NULL) {
1747
        return -ENOTSUP;
1748
    }
1749

    
1750
    memset(res, 0, sizeof(*res));
1751
    return bs->drv->bdrv_check(bs, res, fix);
1752
}
1753

    
1754
#define COMMIT_BUF_SECTORS 2048
1755

    
1756
/* commit COW file into the raw image */
1757
int bdrv_commit(BlockDriverState *bs)
1758
{
1759
    BlockDriver *drv = bs->drv;
1760
    int64_t sector, total_sectors;
1761
    int n, ro, open_flags;
1762
    int ret = 0;
1763
    uint8_t *buf;
1764
    char filename[PATH_MAX];
1765

    
1766
    if (!drv)
1767
        return -ENOMEDIUM;
1768
    
1769
    if (!bs->backing_hd) {
1770
        return -ENOTSUP;
1771
    }
1772

    
1773
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1774
        return -EBUSY;
1775
    }
1776

    
1777
    ro = bs->backing_hd->read_only;
1778
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1779
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1780
    open_flags =  bs->backing_hd->open_flags;
1781

    
1782
    if (ro) {
1783
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1784
            return -EACCES;
1785
        }
1786
    }
1787

    
1788
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1789
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1790

    
1791
    for (sector = 0; sector < total_sectors; sector += n) {
1792
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1793

    
1794
            if (bdrv_read(bs, sector, buf, n) != 0) {
1795
                ret = -EIO;
1796
                goto ro_cleanup;
1797
            }
1798

    
1799
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1800
                ret = -EIO;
1801
                goto ro_cleanup;
1802
            }
1803
        }
1804
    }
1805

    
1806
    if (drv->bdrv_make_empty) {
1807
        ret = drv->bdrv_make_empty(bs);
1808
        bdrv_flush(bs);
1809
    }
1810

    
1811
    /*
1812
     * Make sure all data we wrote to the backing device is actually
1813
     * stable on disk.
1814
     */
1815
    if (bs->backing_hd)
1816
        bdrv_flush(bs->backing_hd);
1817

    
1818
ro_cleanup:
1819
    g_free(buf);
1820

    
1821
    if (ro) {
1822
        /* ignoring error return here */
1823
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1824
    }
1825

    
1826
    return ret;
1827
}
1828

    
1829
int bdrv_commit_all(void)
1830
{
1831
    BlockDriverState *bs;
1832

    
1833
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1834
        if (bs->drv && bs->backing_hd) {
1835
            int ret = bdrv_commit(bs);
1836
            if (ret < 0) {
1837
                return ret;
1838
            }
1839
        }
1840
    }
1841
    return 0;
1842
}
1843

    
1844
/**
1845
 * Remove an active request from the tracked requests list
1846
 *
1847
 * This function should be called when a tracked request is completing.
1848
 */
1849
static void tracked_request_end(BdrvTrackedRequest *req)
1850
{
1851
    QLIST_REMOVE(req, list);
1852
    qemu_co_queue_restart_all(&req->wait_queue);
1853
}
1854

    
1855
/**
1856
 * Add an active request to the tracked requests list
1857
 */
1858
static void tracked_request_begin(BdrvTrackedRequest *req,
1859
                                  BlockDriverState *bs,
1860
                                  int64_t sector_num,
1861
                                  int nb_sectors, bool is_write)
1862
{
1863
    *req = (BdrvTrackedRequest){
1864
        .bs = bs,
1865
        .sector_num = sector_num,
1866
        .nb_sectors = nb_sectors,
1867
        .is_write = is_write,
1868
        .co = qemu_coroutine_self(),
1869
    };
1870

    
1871
    qemu_co_queue_init(&req->wait_queue);
1872

    
1873
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1874
}
1875

    
1876
/**
1877
 * Round a region to cluster boundaries
1878
 */
1879
void bdrv_round_to_clusters(BlockDriverState *bs,
1880
                            int64_t sector_num, int nb_sectors,
1881
                            int64_t *cluster_sector_num,
1882
                            int *cluster_nb_sectors)
1883
{
1884
    BlockDriverInfo bdi;
1885

    
1886
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1887
        *cluster_sector_num = sector_num;
1888
        *cluster_nb_sectors = nb_sectors;
1889
    } else {
1890
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1891
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1892
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1893
                                            nb_sectors, c);
1894
    }
1895
}
1896

    
1897
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /* Two half-open sector ranges overlap iff each one starts strictly
     * before the other one ends. */
    return sector_num < req->sector_num + req->nb_sectors &&
           req->sector_num < sector_num + nb_sectors;
}
1909

    
1910
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    /* Rescan the whole list after every yield: it may have changed while
     * this coroutine was sleeping. */
    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* Block until the conflicting request signals completion */
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}
1945

    
1946
/*
1947
 * Return values:
1948
 * 0        - success
1949
 * -EINVAL  - backing format specified, but no file
1950
 * -ENOSPC  - can't update the backing file because no space is left in the
1951
 *            image file header
1952
 * -ENOTSUP - format driver doesn't support changing the backing file
1953
 */
1954
int bdrv_change_backing_file(BlockDriverState *bs,
1955
    const char *backing_file, const char *backing_fmt)
1956
{
1957
    BlockDriver *drv = bs->drv;
1958
    int ret;
1959

    
1960
    /* Backing file format doesn't make sense without a backing file */
1961
    if (backing_fmt && !backing_file) {
1962
        return -EINVAL;
1963
    }
1964

    
1965
    if (drv->bdrv_change_backing_file != NULL) {
1966
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1967
    } else {
1968
        ret = -ENOTSUP;
1969
    }
1970

    
1971
    if (ret == 0) {
1972
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1973
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1974
    }
1975
    return ret;
1976
}
1977

    
1978
/*
1979
 * Finds the image layer in the chain that has 'bs' as its backing file.
1980
 *
1981
 * active is the current topmost image.
1982
 *
1983
 * Returns NULL if bs is not found in active's image chain,
1984
 * or if active == bs.
1985
 */
1986
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
1987
                                    BlockDriverState *bs)
1988
{
1989
    BlockDriverState *overlay = NULL;
1990
    BlockDriverState *intermediate;
1991

    
1992
    assert(active != NULL);
1993
    assert(bs != NULL);
1994

    
1995
    /* if bs is the same as active, then by definition it has no overlay
1996
     */
1997
    if (active == bs) {
1998
        return NULL;
1999
    }
2000

    
2001
    intermediate = active;
2002
    while (intermediate->backing_hd) {
2003
        if (intermediate->backing_hd == bs) {
2004
            overlay = intermediate;
2005
            break;
2006
        }
2007
        intermediate = intermediate->backing_hd;
2008
    }
2009

    
2010
    return overlay;
2011
}
2012

    
2013
/* Bookkeeping node used by bdrv_drop_intermediate() to queue up the
 * BlockDriverStates that must be deleted once the chain is re-linked. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;                       /* layer slated for deletion */
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; /* linkage in the queue */
} BlkIntermediateStates;
2017

    
2018

    
2019
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    /* Queue of layers to delete; populated before any chain mutation so
     * failure paths can back out without having touched the chain. */
    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_delete(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* Free the bookkeeping nodes on every path; the BDSes themselves are
     * deleted only on the success path above. */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2119

    
2120

    
2121
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2122
                                   size_t size)
2123
{
2124
    int64_t len;
2125

    
2126
    if (!bdrv_is_inserted(bs))
2127
        return -ENOMEDIUM;
2128

    
2129
    if (bs->growable)
2130
        return 0;
2131

    
2132
    len = bdrv_getlength(bs);
2133

    
2134
    if (offset < 0)
2135
        return -EIO;
2136

    
2137
    if ((offset > len) || (len - offset < size))
2138
        return -EIO;
2139

    
2140
    return 0;
2141
}
2142

    
2143
/* Sector-granularity wrapper around bdrv_check_byte_request() */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    int64_t offset = sector_num * BDRV_SECTOR_SIZE;
    size_t bytes = nb_sectors * BDRV_SECTOR_SIZE;

    return bdrv_check_byte_request(bs, offset, bytes);
}
2149

    
2150
/* Parameter/result bundle passed to bdrv_rw_co_entry() so a synchronous
 * read/write can be driven from inside a coroutine. */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;   /* false: readv path, true: writev path */
    int ret;         /* NOT_DONE until the coroutine completes */
} RwCo;
2158

    
2159
/* Coroutine entry point: perform the transfer described by the RwCo and
 * store the result in rwco->ret. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (rwco->is_write) {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    }
}
2171

    
2172
/*
 * Process a vectored synchronous request using coroutines
 *
 * qiov->size must be a multiple of BDRV_SECTOR_SIZE.  Returns the
 * coroutine's result (0 / -errno).
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,   /* sentinel: still in flight */
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* Pump the event loop until the coroutine stores its result */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
2212

    
2213
/*
2214
 * Process a synchronous request using coroutines
2215
 */
2216
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2217
                      int nb_sectors, bool is_write)
2218
{
2219
    QEMUIOVector qiov;
2220
    struct iovec iov = {
2221
        .iov_base = (void *)buf,
2222
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2223
    };
2224

    
2225
    qemu_iovec_init_external(&qiov, &iov, 1);
2226
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
2227
}
2228

    
2229
/* return < 0 if error. See bdrv_write() for the return codes */
2230
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2231
              uint8_t *buf, int nb_sectors)
2232
{
2233
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
2234
}
2235

    
2236
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2237
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2238
                          uint8_t *buf, int nb_sectors)
2239
{
2240
    bool enabled;
2241
    int ret;
2242

    
2243
    enabled = bs->io_limits_enabled;
2244
    bs->io_limits_enabled = false;
2245
    ret = bdrv_read(bs, 0, buf, 1);
2246
    bs->io_limits_enabled = enabled;
2247
    return ret;
2248
}
2249

    
2250
/* Return < 0 if error. Important errors are:
2251
  -EIO         generic I/O error (may happen for all errors)
2252
  -ENOMEDIUM   No media inserted.
2253
  -EINVAL      Invalid sector number or nb_sectors
2254
  -EACCES      Trying to write a read-only device
2255
*/
2256
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2257
               const uint8_t *buf, int nb_sectors)
2258
{
2259
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
2260
}
2261

    
2262
int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
2263
{
2264
    return bdrv_rwv_co(bs, sector_num, qiov, true);
2265
}
2266

    
2267
/* Byte-granularity read emulated on top of sector reads.
 * Handles an unaligned head, a sector-aligned middle read directly into
 * the caller's buffer, and an unaligned tail.
 * Returns count1 on success, < 0 on error. */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        /* copy only the requested slice of the bounce sector */
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
2311

    
2312
/* Byte-granularity vectored write emulated on top of sector I/O.
 * Unaligned head and tail use a read-modify-write of a single bounce
 * sector; the aligned middle portion is written directly from the iovec.
 * Returns qiov->size on success, < 0 on error. */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        /* read-modify-write the partial head sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        /* build a view of qiov skipping the head bytes already written */
        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        /* read-modify-write the partial tail sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}
2368

    
2369
/* Byte-granularity write of a linear buffer; see bdrv_pwritev() */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };
    QEMUIOVector qiov;

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}
2381

    
2382
/*
2383
 * Writes to the file and ensures that no writes are reordered across this
2384
 * request (acts as a barrier)
2385
 *
2386
 * Returns 0 on success, -errno in error cases.
2387
 */
2388
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2389
    const void *buf, int count)
2390
{
2391
    int ret;
2392

    
2393
    ret = bdrv_pwrite(bs, offset, buf, count);
2394
    if (ret < 0) {
2395
        return ret;
2396
    }
2397

    
2398
    /* No flush needed for cache modes that already do it */
2399
    if (bs->enable_write_cache) {
2400
        bdrv_flush(bs);
2401
    }
2402

    
2403
    return 0;
2404
}
2405

    
2406
/* Copy-on-read: read a whole cluster through a bounce buffer, write it
 * back into the image, then hand the requested slice to the caller. */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    /* Read the full cluster from the (backing) image */
    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Prefer an efficient zero write when the cluster is all zeroes */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Copy just the requested window out of the cluster-sized bounce buffer */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
2471

    
2472
/*
 * Handle a read request in coroutine context
 *
 * Applies read throttling, copy-on-read, and request tracking around the
 * driver's bdrv_co_readv.  Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    /* Balanced by the decrement after tracked_request_end() below */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    /* Serialize against in-flight CoR requests touching the same cluster */
    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* Any unallocated part forces the copy-on-read slow path */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}
2533

    
2534
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
2535
    int nb_sectors, QEMUIOVector *qiov)
2536
{
2537
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
2538

    
2539
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
2540
}
2541

    
2542
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
2543
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2544
{
2545
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
2546

    
2547
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
2548
                            BDRV_REQ_COPY_ON_READ);
2549
}
2550

    
2551
/* Write zeroes, preferring the driver's dedicated operation and falling
 * back to an ordinary write of a zero-filled bounce buffer. */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector zero_qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}
2581

    
2582
/*
 * Handle a write request in coroutine context
 *
 * Applies write throttling, request tracking, before-write notifiers,
 * write-through flushing, and dirty-bitmap bookkeeping around the
 * driver's bdrv_co_writev.  Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    /* Serialize against in-flight copy-on-read on overlapping clusters */
    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    /* Give registered notifiers a chance to veto or intercept the write */
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    /* Write-through semantics: flush after each successful write */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        bdrv_set_dirty(bs, sector_num, nb_sectors);
    }

    /* Track the highest sector ever written (used for reporting) */
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}
2640

    
2641
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2642
    int nb_sectors, QEMUIOVector *qiov)
2643
{
2644
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2645

    
2646
    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2647
}
2648

    
2649
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2650
                                      int64_t sector_num, int nb_sectors)
2651
{
2652
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2653

    
2654
    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2655
                             BDRV_REQ_ZERO_WRITE);
2656
}
2657

    
2658
/**
2659
 * Truncate file to 'offset' bytes (needed only for file protocols)
2660
 */
2661
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2662
{
2663
    BlockDriver *drv = bs->drv;
2664
    int ret;
2665
    if (!drv)
2666
        return -ENOMEDIUM;
2667
    if (!drv->bdrv_truncate)
2668
        return -ENOTSUP;
2669
    if (bs->read_only)
2670
        return -EACCES;
2671
    if (bdrv_in_use(bs))
2672
        return -EBUSY;
2673
    ret = drv->bdrv_truncate(bs, offset);
2674
    if (ret == 0) {
2675
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2676
        bdrv_dev_resize_cb(bs);
2677
    }
2678
    return ret;
2679
}
2680

    
2681
/**
2682
 * Length of a allocated file in bytes. Sparse files are counted by actual
2683
 * allocated space. Return < 0 if error or unknown.
2684
 */
2685
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2686
{
2687
    BlockDriver *drv = bs->drv;
2688
    if (!drv) {
2689
        return -ENOMEDIUM;
2690
    }
2691
    if (drv->bdrv_get_allocated_file_size) {
2692
        return drv->bdrv_get_allocated_file_size(bs);
2693
    }
2694
    if (bs->file) {
2695
        return bdrv_get_allocated_file_size(bs->file);
2696
    }
2697
    return -ENOTSUP;
2698
}
2699

    
2700
/**
2701
 * Length of a file in bytes. Return < 0 if error or unknown.
2702
 */
2703
int64_t bdrv_getlength(BlockDriverState *bs)
2704
{
2705
    BlockDriver *drv = bs->drv;
2706
    if (!drv)
2707
        return -ENOMEDIUM;
2708

    
2709
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2710
        if (drv->bdrv_getlength) {
2711
            return drv->bdrv_getlength(bs);
2712
        }
2713
    }
2714
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2715
}
2716

    
2717
/* return 0 as number of sectors if no device present or error */
2718
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2719
{
2720
    int64_t length;
2721
    length = bdrv_getlength(bs);
2722
    if (length < 0)
2723
        length = 0;
2724
    else
2725
        length = length >> BDRV_SECTOR_BITS;
2726
    *nb_sectors_ptr = length;
2727
}
2728

    
2729
/* throttling disk io limits */
2730
void bdrv_set_io_limits(BlockDriverState *bs,
2731
                        BlockIOLimit *io_limits)
2732
{
2733
    bs->io_limits = *io_limits;
2734
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2735
}
2736

    
2737
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
2738
                       BlockdevOnError on_write_error)
2739
{
2740
    bs->on_read_error = on_read_error;
2741
    bs->on_write_error = on_write_error;
2742
}
2743

    
2744
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
2745
{
2746
    return is_read ? bs->on_read_error : bs->on_write_error;
2747
}
2748

    
2749
/* Map the configured error policy plus the errno to a concrete action */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_ENOSPC:
        /* stop only on out-of-space; report everything else */
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    default:
        abort();
    }
}
2766

    
2767
/* This is done by device models because, while the block layer knows
2768
 * about the error, it does not know whether an operation comes from
2769
 * the device or the block layer (from a job, for example).
2770
 */
2771
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2772
                       bool is_read, int error)
2773
{
2774
    assert(error >= 0);
2775
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
2776
    if (action == BDRV_ACTION_STOP) {
2777
        vm_stop(RUN_STATE_IO_ERROR);
2778
        bdrv_iostatus_set_err(bs, error);
2779
    }
2780
}
2781

    
2782
/* Whether the device was opened read-only */
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

/* Whether the device is a SCSI-generic (sg) pass-through device */
int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

/* Whether writeback caching is currently enabled */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}
2796

    
2797
/* Enable or disable writeback caching and record it in the open flags */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}
2808

    
2809
/* True if this image or its backing file is encrypted */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return 1;
    }
    return bs->encrypted;
}

/* True if an encryption key is still needed for this image or its
 * backing file */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) {
        return 1;
    }
    return bs->encrypted && !bs->valid_key;
}
2824

    
2825
/* Supply the encryption key for an image (recursing into the backing
 * file first).  Returns 0 on success, -errno otherwise. */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;

    /* Unlock the backing file before this layer */
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2850

    
2851
/* Return the format name of the attached driver, or NULL if no medium. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}
2855

    
2856
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2857
                         void *opaque)
2858
{
2859
    BlockDriver *drv;
2860

    
2861
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
2862
        it(opaque, drv->format_name);
2863
    }
2864
}
2865

    
2866
BlockDriverState *bdrv_find(const char *name)
2867
{
2868
    BlockDriverState *bs;
2869

    
2870
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2871
        if (!strcmp(name, bs->device_name)) {
2872
            return bs;
2873
        }
2874
    }
2875
    return NULL;
2876
}
2877

    
2878
/*
 * Iterate over all BlockDriverStates: pass NULL to obtain the first one,
 * then each previous result to obtain its successor; NULL marks the end.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
2885

    
2886
/* Invoke @it for every BlockDriverState on the global list. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
2894

    
2895
/* Return the device name of @bs (may be the empty string). */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}
2899

    
2900
/* Return the BDRV_O_* flags @bs was opened with. */
int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}
2904

    
2905
void bdrv_flush_all(void)
2906
{
2907
    BlockDriverState *bs;
2908

    
2909
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2910
        bdrv_flush(bs);
2911
    }
2912
}
2913

    
2914
/*
 * Return 1 if a freshly created image reads back as all zeroes.
 * Drivers without the query hook are assumed to zero-initialize.
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (!bs->drv->bdrv_has_zero_init) {
        return 1;
    }
    return bs->drv->bdrv_has_zero_init(bs);
}
2924

    
2925
/* Shared state between the synchronous bdrv_is_allocated*() wrappers and
 * their coroutine entry points. */
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;       /* top image queried */
    BlockDriverState *base;     /* lower bound for the *_above variant */
    int64_t sector_num;         /* first sector of the query */
    int nb_sectors;             /* length of the query in sectors */
    int *pnum;                  /* out: run length in same state */
    int ret;                    /* out: result of the coroutine call */
    bool done;                  /* set when the coroutine has finished */
} BdrvCoIsAllocatedData;
2934

    
2935
/*
2936
 * Returns true iff the specified sector is present in the disk image. Drivers
2937
 * not implementing the functionality are assumed to not support backing files,
2938
 * hence all their sectors are reported as allocated.
2939
 *
2940
 * If 'sector_num' is beyond the end of the disk image the return value is 0
2941
 * and 'pnum' is set to 0.
2942
 *
2943
 * 'pnum' is set to the number of sectors (including and immediately following
2944
 * the specified sector) that are known to be in the same
2945
 * allocated/unallocated state.
2946
 *
2947
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2948
 * beyond the end of the disk image it will be clamped.
2949
 */
2950
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2951
                                      int nb_sectors, int *pnum)
2952
{
2953
    int64_t n;
2954

    
2955
    if (sector_num >= bs->total_sectors) {
2956
        *pnum = 0;
2957
        return 0;
2958
    }
2959

    
2960
    n = bs->total_sectors - sector_num;
2961
    if (n < nb_sectors) {
2962
        nb_sectors = n;
2963
    }
2964

    
2965
    if (!bs->drv->bdrv_co_is_allocated) {
2966
        *pnum = nb_sectors;
2967
        return 1;
2968
    }
2969

    
2970
    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2971
}
2972

    
2973
/* Coroutine wrapper for bdrv_is_allocated() */
2974
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2975
{
2976
    BdrvCoIsAllocatedData *data = opaque;
2977
    BlockDriverState *bs = data->bs;
2978

    
2979
    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2980
                                     data->pnum);
2981
    data->done = true;
2982
}
2983

    
2984
/*
2985
 * Synchronous wrapper around bdrv_co_is_allocated().
2986
 *
2987
 * See bdrv_co_is_allocated() for details.
2988
 */
2989
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2990
                      int *pnum)
2991
{
2992
    Coroutine *co;
2993
    BdrvCoIsAllocatedData data = {
2994
        .bs = bs,
2995
        .sector_num = sector_num,
2996
        .nb_sectors = nb_sectors,
2997
        .pnum = pnum,
2998
        .done = false,
2999
    };
3000

    
3001
    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
3002
    qemu_coroutine_enter(co, &data);
3003
    while (!data.done) {
3004
        qemu_aio_wait();
3005
    }
3006
    return data.ret;
3007
}
3008

    
3009
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk down the backing chain from top towards (but excluding) base */
    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            /* Allocated in this layer: report its run length directly */
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        /* Shrink the reported unallocated run, except where the layer is
         * short and the run already extends past its end. */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    /* Not allocated anywhere in the [top, base) range */
    *pnum = n;
    return 0;
}
3059

    
3060
/* Coroutine wrapper for bdrv_is_allocated_above() */
3061
static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
3062
{
3063
    BdrvCoIsAllocatedData *data = opaque;
3064
    BlockDriverState *top = data->bs;
3065
    BlockDriverState *base = data->base;
3066

    
3067
    data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
3068
                                           data->nb_sectors, data->pnum);
3069
    data->done = true;
3070
}
3071

    
3072
/*
3073
 * Synchronous wrapper around bdrv_co_is_allocated_above().
3074
 *
3075
 * See bdrv_co_is_allocated_above() for details.
3076
 */
3077
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
3078
                            int64_t sector_num, int nb_sectors, int *pnum)
3079
{
3080
    Coroutine *co;
3081
    BdrvCoIsAllocatedData data = {
3082
        .bs = top,
3083
        .base = base,
3084
        .sector_num = sector_num,
3085
        .nb_sectors = nb_sectors,
3086
        .pnum = pnum,
3087
        .done = false,
3088
    };
3089

    
3090
    co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
3091
    qemu_coroutine_enter(co, &data);
3092
    while (!data.done) {
3093
        qemu_aio_wait();
3094
    }
3095
    return data.ret;
3096
}
3097

    
3098
/*
 * Return the filename of the encrypted image in the chain — the backing
 * file takes precedence — or NULL if nothing is encrypted.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    } else if (bs->encrypted) {
        return bs->filename;
    } else {
        return NULL;
    }
}
3107

    
3108
/* Copy the backing filename of @bs into @filename (always NUL-terminated,
 * truncated to @filename_size). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
3113

    
3114
/*
 * Write @nb_sectors of compressed data at @sector_num.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if the driver has no
 * compressed-write support, -EIO on an out-of-range request, otherwise
 * the driver's result.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* Compressed writes bypass the dirty bitmap, so none may be active */
    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
3129

    
3130
/*
 * Fill @bdi with driver-specific image information.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if the driver does not
 * implement the query, otherwise the driver's result.  @bdi is zeroed
 * first so absent fields read as 0.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
3140

    
3141
/* Save a VM state blob at offset @pos by wrapping @buf in a single-element
 * QEMUIOVector and delegating to bdrv_writev_vmstate(). */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov;

    iov.iov_base = (void *) buf;
    iov.iov_len  = size;
    qemu_iovec_init_external(&qiov, &iov, 1);

    return bdrv_writev_vmstate(bs, &qiov, pos);
}
3153

    
3154
/*
 * Write VM state: handled by the format driver when it implements
 * bdrv_save_vmstate, otherwise forwarded to the underlying protocol layer.
 */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    }
    if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }
    return -ENOTSUP;
}
3168

    
3169
/*
 * Read VM state from offset @pos into @buf, trying the format driver
 * first and falling back to the protocol layer.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_load_vmstate) {
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    }
    if (bs->file) {
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    }
    return -ENOTSUP;
}
3181

    
3182
/* Forward a blkdebug event to the driver if it has an event hook. */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (bs && bs->drv && bs->drv->bdrv_debug_event) {
        bs->drv->bdrv_debug_event(bs, event);
    }
}
3190

    
3191
/* Install a blkdebug breakpoint, searching down the bs->file chain for
 * the first driver that implements the hook. */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }
    return -ENOTSUP;
}
3204

    
3205
/* Resume a request suspended on a blkdebug tag, searching down the
 * bs->file chain for the implementing driver. */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }
    return -ENOTSUP;
}
3217

    
3218
/* Query whether a request is suspended on a blkdebug tag, searching down
 * the bs->file chain for the implementing driver. */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }
    return false;
}
3230

    
3231
/* Return non-zero if @bs was opened with BDRV_O_SNAPSHOT. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}
3235

    
3236
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    /* Scratch buffers for building and canonicalizing candidate paths */
    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    /* NULL when no image in the chain matches @backing_file */
    return retval;
}
3301

    
3302
/* Count the backing images below @bs; 0 when there is no medium or no
 * backing file. */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv || !bs->backing_hd) {
        return 0;
    }
    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}
3314

    
3315
/* Follow the backing chain of @bs down to its bottom-most image;
 * NULL in, NULL out. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *base = bs;

    if (!base) {
        return NULL;
    }
    while (base->backing_hd) {
        base = base->backing_hd;
    }
    return base;
}
3330

    
3331
/**************************************************************/
/* async I/Os */

    
3334
/* Submit an asynchronous vectored read; @cb fires on completion. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    /* is_write = false */
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}
3343

    
3344
/* Submit an asynchronous vectored write; @cb fires on completion. */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    /* is_write = true */
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}
3353

    
3354

    
3355
/* Book-keeping for bdrv_aio_multiwrite(): tracks outstanding merged
 * requests and the per-original-request completion callbacks. */
typedef struct MultiwriteCB {
    int error;                      /* first error seen, 0 if none */
    int num_requests;               /* merged requests still in flight */
    int num_callbacks;              /* original (pre-merge) request count */
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;    /* temp qiov to destroy, or NULL */
    } callbacks[];
} MultiwriteCB;
3365

    
3366
/* Deliver the completion callback for every original request in @mcb and
 * release any temporary qiovs that were created while merging. */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx;

    for (idx = 0; idx < mcb->num_callbacks; idx++) {
        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);
        if (mcb->callbacks[idx].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[idx].free_qiov);
        }
        g_free(mcb->callbacks[idx].free_qiov);
    }
}
3378

    
3379
static void multiwrite_cb(void *opaque, int ret)
3380
{
3381
    MultiwriteCB *mcb = opaque;
3382

    
3383
    trace_multiwrite_cb(mcb, ret);
3384

    
3385
    if (ret < 0 && !mcb->error) {
3386
        mcb->error = ret;
3387
    }
3388

    
3389
    mcb->num_requests--;
3390
    if (mcb->num_requests == 0) {
3391
        multiwrite_user_cb(mcb);
3392
        g_free(mcb);
3393
    }
3394
}
3395

    
3396
static int multiwrite_req_compare(const void *a, const void *b)
3397
{
3398
    const BlockRequest *req1 = a, *req2 = b;
3399

    
3400
    /*
3401
     * Note that we can't simply subtract req2->sector from req1->sector
3402
     * here as that could overflow the return value.
3403
     */
3404
    if (req1->sector > req2->sector) {
3405
        return 1;
3406
    } else if (req1->sector < req2->sector) {
3407
        return -1;
3408
    } else {
3409
        return 0;
3410
    }
3411
}
3412

    
3413
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't merge if the combined iovec would exceed IOV_MAX entries
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged qiov so multiwrite_user_cb() can free it
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            // Keep the request as a separate entry in the compacted array
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
3472

    
3473
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    /* Record the original callbacks before merging reorders the array */
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
3528

    
3529
/* Cancel an in-flight AIO request via its AIOCBInfo cancel hook. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}
3533

    
3534
/* block I/O throttling */
3535
/*
 * Check whether submitting @nb_sectors now would exceed the configured
 * bytes-per-second limit for the current accounting slice.
 *
 * Returns true (and stores the estimated delay in ns into *wait, if
 * non-NULL) when the request must be throttled; false when it may
 * proceed, in which case *wait is set to 0.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    uint64_t extension;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    /* The total limit takes precedence over the per-direction limit */
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->slice_submitted.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->slice_submitted.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; and
     *             it is obtained from the history statistic info.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calcuate
     *             the total time for completing reading/writting all data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits,
     * bs->slice_end need to be extended in order that the current statistic
     * info can be kept until the timer fire, so it is increased and tuned
     * based on the result of experiment.
     */
    extension = wait_time * NANOSECONDS_PER_SECOND;
    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
                BLOCK_IO_SLICE_TIME;
    bs->slice_end += extension;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}
3597

    
3598
/*
 * Check whether submitting one more I/O now would exceed the configured
 * IOPS limit for the current accounting slice.
 *
 * Returns true (and stores the estimated delay in ns into *wait, if
 * non-NULL) when the request must be throttled; false when it may
 * proceed, in which case *wait is set to 0.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    /* The total limit takes precedence over the per-direction limit */
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->slice_submitted.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->slice_submitted.ios[!is_write];
    }

    /* Room for one more I/O in this slice: let it through */
    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch, in seconds */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Exceeded current slice, extend it by another slice time */
    bs->slice_end += BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}
3649

    
3650
/*
 * Combined throttling check: returns true when either the bps or the
 * iops limit would be exceeded, storing the larger required delay (ns)
 * into *wait.  On false, the request is accounted against the current
 * slice and may be submitted immediately.
 */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    /* Start a fresh accounting slice if the previous one has expired */
    now = qemu_get_clock_ns(vm_clock);
    if (now > bs->slice_end) {
        bs->slice_start = now;
        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        /* Throttle: report the longer of the two required delays */
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        /* Make sure the slice lasts at least until the delayed retry */
        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    /* Request admitted: charge it to the current slice's statistics */
    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
                                           BDRV_SECTOR_SIZE;
    bs->slice_submitted.ios[is_write]++;

    return false;
}
3696

    
3697
/**************************************************************/
/* async block device emulation */

    
3700
/* AIOCB used to emulate AIO on top of a driver's synchronous
 * bdrv_read/bdrv_write, completing via a bottom half. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;             /* bottom half that delivers the completion */
    int ret;                /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;     /* caller's scatter/gather list */
    uint8_t *bounce;        /* linear bounce buffer for the sync call */
    int is_write;           /* non-zero for writes */
} BlockDriverAIOCBSync;
3709

    
3710
/* Cancel a sync-emulated AIOCB: drop its pending bottom half so the
 * completion callback never runs, then release the AIOCB. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
3718

    
3719
/* AIOCB type descriptor for the synchronous-emulation path. */
static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
3723

    
3724
static void bdrv_aio_bh_cb(void *opaque)
3725
{
3726
    BlockDriverAIOCBSync *acb = opaque;
3727

    
3728
    if (!acb->is_write)
3729
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3730
    qemu_vfree(acb->bounce);
3731
    acb->common.cb(acb->common.opaque, acb->ret);
3732
    qemu_bh_delete(acb->bh);
3733
    acb->bh = NULL;
3734
    qemu_aio_release(acb);
3735
}
3736

    
3737
/*
 * Emulate an asynchronous vectored read/write on top of the driver's
 * synchronous bdrv_read/bdrv_write: the I/O happens immediately through
 * a bounce buffer, and completion is delivered via a scheduled BH so the
 * callback still runs asynchronously from the caller's point of view.
 */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        /* Flatten the caller's qiov into the bounce buffer first */
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    /* Deliver completion from a bottom half, not inline */
    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
3765

    
3766
/* Emulated-AIO read entry point (is_write = 0). */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
3772

    
3773
/* Emulated-AIO write entry point (is_write = 1). */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
3779

    
3780

    
3781
/* AIOCB backing the coroutine-based read/write path. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;       /* sector, nb_sectors, qiov and result */
    bool is_write;          /* direction of the request */
    bool *done;             /* set by the completion BH when a canceller
                             * is synchronously waiting; NULL otherwise */
    QEMUBH* bh;             /* bottom half delivering the completion */
} BlockDriverAIOCBCoroutine;
3788

    
3789
/*
 * Cancel for the coroutine emulation: the running coroutine cannot be
 * aborted, so simply pump the AIO event loop until the request completes
 * (bdrv_co_em_bh() sets *acb->done just before releasing the ACB).
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}
/* ACB descriptor for the coroutine-based emulation. */
static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
/*
 * Bottom half run after the coroutine has finished: deliver the result to
 * the user callback, wake a pending cancel (if any), and free the ACB.
 */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    /* Signal bdrv_aio_co_cancel_em() that the request is over. */
    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3821
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3822
{
3823
    BlockDriverAIOCBCoroutine *acb = opaque;
3824
    BlockDriverState *bs = acb->common.bs;
3825

    
3826
    if (!acb->is_write) {
3827
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3828
            acb->req.nb_sectors, acb->req.qiov, 0);
3829
    } else {
3830
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3831
            acb->req.nb_sectors, acb->req.qiov, 0);
3832
    }
3833

    
3834
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3835
    qemu_bh_schedule(acb->bh);
3836
}
3837

    
3838
/*
 * Common AIO front-end: record the request in a fresh ACB and kick off
 * bdrv_co_do_rw in a new coroutine. Completion is reported via the BH.
 */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;
    acb->done = NULL;   /* only set later if the request gets cancelled */

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
/* Coroutine entry point for bdrv_aio_flush(): flush, then complete via BH. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3873
        BlockDriverCompletionFunc *cb, void *opaque)
3874
{
3875
    trace_bdrv_aio_flush(bs, opaque);
3876

    
3877
    Coroutine *co;
3878
    BlockDriverAIOCBCoroutine *acb;
3879

    
3880
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3881
    acb->done = NULL;
3882

    
3883
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3884
    qemu_coroutine_enter(co, acb);
3885

    
3886
    return &acb->common;
3887
}
3888

    
3889
/* Coroutine entry point for bdrv_aio_discard(): discard, then complete
 * via BH. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
/* Asynchronous discard: run bdrv_co_discard() in a coroutine and report
 * the result through @cb. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
/* Register all built-in block drivers (runs the MODULE_INIT_BLOCK hooks). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
/* Like bdrv_init(), but restrict format probing to the configured
 * driver whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
/*
 * Allocate an AIO control block of aiocb_info->aiocb_size bytes and fill
 * in the common header. The caller owns the memory until it is returned
 * with qemu_aio_release().
 */
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}
/* Return an ACB allocated by qemu_aio_get() to its slice allocator. */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
/**************************************************************/
3949
/* Coroutine block device emulation */
3950

    
3951
/* Rendezvous between an AIO completion callback and a waiting coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* request status filled in by the callback */
} CoroutineIOCompletion;
/* AIO completion callback: record the status and wake the yielded
 * coroutine. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
/*
 * Drive a driver's aio readv/writev interface from coroutine context:
 * submit the request, yield until bdrv_co_io_em_complete() re-enters us,
 * and return the status. Returns -EIO if the driver refused the request.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    BlockDriverAIOCB *acb;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    /* The completion callback stores co.ret and re-enters this coroutine. */
    qemu_coroutine_yield();

    return co.ret;
}
/* Coroutine readv emulation on top of a driver's aio interface. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
/* Coroutine writev emulation on top of a driver's aio interface. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
/* Coroutine trampoline for bdrv_flush(): store the result in the RwCo. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
/*
 * Flush @bs and, recursively, its underlying protocol (bs->file).
 *
 * Stage 1: always push cached data to the OS (bdrv_co_flush_to_os), even
 * with cache=unsafe. Stage 2: force it to stable storage unless
 * BDRV_O_NO_FLUSH is set, preferring the coroutine hook, falling back to
 * the aio hook, and finally treating a driver with neither as a no-op.
 * Returns 0 on success (including the no-medium/read-only cases) or a
 * negative errno.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to do without a writable, inserted medium (also terminates
     * the bs->file recursion when bs is NULL). */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Bridge the aio-style hook into this coroutine. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
/* Drop any metadata the driver has cached, if it supports doing so
 * (used e.g. after incoming migration). */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_invalidate_cache) {
        drv->bdrv_invalidate_cache(bs);
    }
}
/* Invalidate the cached metadata of every registered BlockDriverState. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}
void bdrv_clear_incoming_migration_all(void)
4091
{
4092
    BlockDriverState *bs;
4093

    
4094
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4095
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4096
    }
4097
}
4098

    
4099
/*
 * Synchronous wrapper around bdrv_co_flush(): runs it directly when
 * already in coroutine context, otherwise spawns a coroutine and pumps
 * the AIO loop until it finishes. Returns the flush status.
 */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* NOT_DONE sentinel is overwritten by bdrv_flush_co_entry(). */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
/* Coroutine trampoline for bdrv_discard(): store the result in the RwCo. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
/*
 * Discard (unmap) a sector range. Validates the request, clears the
 * range in the dirty bitmap, and forwards to the driver's coroutine or
 * aio discard hook. Silently succeeds when discard is disabled
 * (BDRV_O_UNMAP not set) or unsupported by the driver.
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    /* Discarded sectors no longer carry data worth mirroring/backing up. */
    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        /* Bridge the aio-style hook into this coroutine. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}
/*
 * Synchronous wrapper around bdrv_co_discard(): runs it directly when
 * already in coroutine context, otherwise spawns a coroutine and pumps
 * the AIO loop until it finishes. Returns the discard status.
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* NOT_DONE sentinel is overwritten by bdrv_discard_co_entry(). */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
/**************************************************************/
4194
/* removable device support */
4195

    
4196
/**
4197
 * Return TRUE if the media is present
4198
 */
4199
int bdrv_is_inserted(BlockDriverState *bs)
4200
{
4201
    BlockDriver *drv = bs->drv;
4202

    
4203
    if (!drv)
4204
        return 0;
4205
    if (!drv->bdrv_is_inserted)
4206
        return 1;
4207
    return drv->bdrv_is_inserted(bs);
4208
}
4209

    
4210
/**
4211
 * Return whether the media changed since the last call to this
4212
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4213
 */
4214
int bdrv_media_changed(BlockDriverState *bs)
4215
{
4216
    BlockDriver *drv = bs->drv;
4217

    
4218
    if (drv && drv->bdrv_media_changed) {
4219
        return drv->bdrv_media_changed(bs);
4220
    }
4221
    return -ENOTSUP;
4222
}
4223

    
4224
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    /* Only named (guest-attached) devices emit the QMP DEVICE_TRAY_MOVED
     * event. */
    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}
4240
/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
/* needed for generic scsi interface */
4256

    
4257
/* Forward a SCSI-generic ioctl to the driver; -ENOTSUP if unsupported. */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }
    return drv->bdrv_ioctl(bs, req, buf);
}
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4267
        unsigned long int req, void *buf,
4268
        BlockDriverCompletionFunc *cb, void *opaque)
4269
{
4270
    BlockDriver *drv = bs->drv;
4271

    
4272
    if (drv && drv->bdrv_aio_ioctl)
4273
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4274
    return NULL;
4275
}
4276

    
4277
/* Set the memory alignment required for buffers used with this device. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
/* Allocate a buffer aligned for I/O on @bs (512 bytes when no bs or no
 * alignment requirement is known). */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = 512;

    if (bs && bs->buffer_alignment) {
        align = bs->buffer_alignment;
    }
    return qemu_memalign(align, size);
}
/*
4288
 * Check if all memory in this vector is sector aligned.
4289
 */
4290
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4291
{
4292
    int i;
4293

    
4294
    for (i = 0; i < qiov->niov; i++) {
4295
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4296
            return false;
4297
        }
4298
    }
4299

    
4300
    return true;
4301
}
4302

    
4303
/*
 * Enable dirty tracking with the given @granularity in bytes (must be a
 * power of two), or disable it when @granularity is 0 and free the bitmap.
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        /* Convert the byte granularity to sectors for the hbitmap. */
        granularity >>= BDRV_SECTOR_BITS;
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        /* ffs(granularity) - 1 == log2(granularity): one bit covers
         * 'granularity' sectors. */
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
/* Return whether @sector is marked dirty (0 when tracking is disabled). */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    return bs->dirty_bitmap ? hbitmap_get(bs->dirty_bitmap, sector) : 0;
}
/* Initialize @hbi to iterate over the dirty bitmap from sector 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}
/* Mark a sector range dirty (dirty bitmap must be allocated). */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}
/* Clear a sector range in the dirty bitmap (bitmap must be allocated). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}
/* Return the number of dirty sectors (0 when tracking is disabled). */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_bitmap ? hbitmap_count(bs->dirty_bitmap) : 0;
}
/* Mark the device as claimed/released by a block job; the assert catches
 * double claims and double releases. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
/* Return whether the device is currently claimed (see bdrv_set_in_use). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
/* Enable I/O status reporting for this device and reset it to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
/* The I/O status is only enabled if the drive explicitly
4375
 * enables it _and_ the VM is configured to stop on errors */
4376
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4377
{
4378
    return (bs->iostatus_enabled &&
4379
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4380
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4381
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4382
}
4383

    
4384
/* Disable I/O status reporting for this device. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
/* Reset the I/O status to OK (and the status of any attached block job),
 * but only when status reporting is currently enabled. */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    if (bs->job) {
        block_job_iostatus_reset(bs->job);
    }
}
/* Record an I/O error in the device status; only the first error after a
 * reset is kept (later ones do not overwrite it). */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        if (error == ENOSPC) {
            bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
        } else {
            bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
        }
    }
}
/* Begin accounting for one I/O operation: record size, start time and
 * type in @cookie for a later bdrv_acct_done(). */
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}
/* Finish accounting for the operation started with bdrv_acct_start():
 * fold bytes, op count and elapsed time into the per-type statistics. */
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
void bdrv_img_create(const char *filename, const char *fmt,
4430
                     const char *base_filename, const char *base_fmt,
4431
                     char *options, uint64_t img_size, int flags,
4432
                     Error **errp, bool quiet)
4433
{
4434
    QEMUOptionParameter *param = NULL, *create_options = NULL;
4435
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
4436
    BlockDriverState *bs = NULL;
4437
    BlockDriver *drv, *proto_drv;
4438
    BlockDriver *backing_drv = NULL;
4439
    int ret = 0;
4440

    
4441
    /* Find driver and parse its options */
4442
    drv = bdrv_find_format(fmt);
4443
    if (!drv) {
4444
        error_setg(errp, "Unknown file format '%s'", fmt);
4445
        return;
4446
    }
4447

    
4448
    proto_drv = bdrv_find_protocol(filename);
4449
    if (!proto_drv) {
4450
        error_setg(errp, "Unknown protocol '%s'", filename);
4451
        return;
4452
    }
4453

    
4454
    create_options = append_option_parameters(create_options,
4455
                                              drv->create_options);
4456
    create_options = append_option_parameters(create_options,
4457
                                              proto_drv->create_options);
4458

    
4459
    /* Create parameter list with default values */
4460
    param = parse_option_parameters("", create_options, param);
4461

    
4462
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4463

    
4464
    /* Parse -o options */
4465
    if (options) {
4466
        param = parse_option_parameters(options, create_options, param);
4467
        if (param == NULL) {
4468
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
4469
            goto out;
4470
        }
4471
    }
4472

    
4473
    if (base_filename) {
4474
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4475
                                 base_filename)) {
4476
            error_setg(errp, "Backing file not supported for file format '%s'",
4477
                       fmt);
4478
            goto out;
4479
        }
4480
    }
4481

    
4482
    if (base_fmt) {
4483
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4484
            error_setg(errp, "Backing file format not supported for file "
4485
                             "format '%s'", fmt);
4486
            goto out;
4487
        }
4488
    }
4489

    
4490
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4491
    if (backing_file && backing_file->value.s) {
4492
        if (!strcmp(filename, backing_file->value.s)) {
4493
            error_setg(errp, "Error: Trying to create an image with the "
4494
                             "same filename as the backing file");
4495
            goto out;
4496
        }
4497
    }
4498

    
4499
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4500
    if (backing_fmt && backing_fmt->value.s) {
4501
        backing_drv = bdrv_find_format(backing_fmt->value.s);
4502
        if (!backing_drv) {
4503
            error_setg(errp, "Unknown backing file format '%s'",
4504
                       backing_fmt->value.s);
4505
            goto out;
4506
        }
4507
    }
4508

    
4509
    // The size for the image must always be specified, with one exception:
4510
    // If we are using a backing file, we can obtain the size from there
4511
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
4512
    if (size && size->value.n == -1) {
4513
        if (backing_file && backing_file->value.s) {
4514
            uint64_t size;
4515
            char buf[32];
4516
            int back_flags;
4517

    
4518
            /* backing files always opened read-only */
4519
            back_flags =
4520
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4521

    
4522
            bs = bdrv_new("");
4523

    
4524
            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
4525
                            backing_drv);
4526
            if (ret < 0) {
4527
                error_setg_errno(errp, -ret, "Could not open '%s'",
4528
                                 backing_file->value.s);
4529
                goto out;
4530
            }
4531
            bdrv_get_geometry(bs, &size);
4532
            size *= 512;
4533

    
4534
            snprintf(buf, sizeof(buf), "%" PRId64, size);
4535
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4536
        } else {
4537
            error_setg(errp, "Image creation needs a size parameter");
4538
            goto out;
4539
        }
4540
    }
4541

    
4542
    if (!quiet) {
4543
        printf("Formatting '%s', fmt=%s ", filename, fmt);
4544
        print_option_parameters(param);
4545
        puts("");
4546
    }
4547
    ret = bdrv_create(drv, filename, param);
4548
    if (ret < 0) {
4549
        if (ret == -ENOTSUP) {
4550
            error_setg(errp,"Formatting or formatting option not supported for "
4551
                            "file format '%s'", fmt);
4552
        } else if (ret == -EFBIG) {
4553
            const char *cluster_size_hint = "";
4554
            if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
4555
                cluster_size_hint = " (try using a larger cluster size)";
4556
            }
4557
            error_setg(errp, "The image size is too large for file format '%s'%s",
4558
                       fmt, cluster_size_hint);
4559
        } else {
4560
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
4561
                       strerror(-ret));
4562
        }
4563
    }
4564

    
4565
out:
4566
    free_option_parameters(create_options);
4567
    free_option_parameters(param);
4568

    
4569
    if (bs) {
4570
        bdrv_delete(bs);
4571
    }
4572
}
4573

    
4574
/* Return the AioContext that drives this BDS's I/O. */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}
/* Register a notifier that runs before every write request on @bs
 * (used e.g. by backup jobs for copy-on-write). */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}