/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are kept in FIFO order: the next throttled request is not
     * dequeued until the current one has been allowed to proceed. So if the
     * current request still exceeds the limits, it is re-inserted at the
     * head of the queue, and all requests behind it stay in the
     * throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
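
/* e.g. "/img/a.qcow2" is absolute on any host; on Win32, "c:\a.img" and
 * device names like "\\.\d:" also count as absolute. */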
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
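/* e.g. base_path "/img/base.qcow2" + filename "snap.qcow2" yields
 * "/img/snap.qcow2". */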
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
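        /* The coroutine may yield on I/O; poll the AIO loop until the
         * entry function has stored its result in cco.ret. */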
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
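/* On POSIX hosts the name is "$TMPDIR/vl.XXXXXX" filled in by mkstemp();
 * on Win32 it comes from GetTempPath()/GetTempFileName(). */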
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
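
/* e.g. an "nbd:..." filename selects the driver whose protocol_name is
 * "nbd"; a plain path such as "/images/a.img" falls back to the "file"
 * driver. */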
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

    
527
static int find_image_format(BlockDriverState *bs, const char *filename,
528
                             BlockDriver **pdrv)
529
{
530
    int score, score_max;
531
    BlockDriver *drv1, *drv;
532
    uint8_t buf[2048];
533
    int ret = 0;
534

    
535
    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
536
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
537
        drv = bdrv_find_format("raw");
538
        if (!drv) {
539
            ret = -ENOENT;
540
        }
541
        *pdrv = drv;
542
        return ret;
543
    }
544

    
545
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
546
    if (ret < 0) {
547
        *pdrv = NULL;
548
        return ret;
549
    }
550

    
551
    score_max = 0;
552
    drv = NULL;
553
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
554
        if (drv1->bdrv_probe) {
555
            score = drv1->bdrv_probe(buf, ret, filename);
556
            if (score > score_max) {
557
                score_max = score;
558
                drv = drv1;
559
            }
560
        }
561
    }
562
    if (!drv) {
563
        ret = -ENOENT;
564
    }
565
    *pdrv = drv;
566
    return ret;
567
}
568

    
569
/**
570
 * Set the current 'total_sectors' value
571
 */
572
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
573
{
574
    BlockDriver *drv = bs->drv;
575

    
576
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
577
    if (bs->sg)
578
        return 0;
579

    
580
    /* query actual device if possible, otherwise just trust the hint */
581
    if (drv->bdrv_getlength) {
582
        int64_t length = drv->bdrv_getlength(bs);
583
        if (length < 0) {
584
            return length;
585
        }
586
        hint = length >> BDRV_SECTOR_BITS;
587
    }
588

    
589
    bs->total_sectors = hint;
590
    return 0;
591
}
592

    
593
/**
594
 * Set open flags for a given discard mode
595
 *
596
 * Return 0 on success, -1 if the discard mode was invalid.
597
 */
598
int bdrv_parse_discard_flags(const char *mode, int *flags)
599
{
600
    *flags &= ~BDRV_O_UNMAP;
601

    
602
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
603
        /* do nothing */
604
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
605
        *flags |= BDRV_O_UNMAP;
606
    } else {
607
        return -1;
608
    }
609

    
610
    return 0;
611
}
612

    
613
/**
614
 * Set open flags for a given cache mode
615
 *
616
 * Return 0 on success, -1 if the cache mode was invalid.
617
 */
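/* Mode -> flags: "none"/"off" = NOCACHE|CACHE_WB, "directsync" = NOCACHE,
 * "writeback" = CACHE_WB, "unsafe" = CACHE_WB|NO_FLUSH,
 * "writethrough" = no flags (the default). */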
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
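
/* Compute the flags actually handed to the driver.  BDRV_O_CACHE_WB is
 * always requested here; for writethrough cache modes the block layer is
 * expected to do the flushing itself via bs->enable_write_cache (set in
 * bdrv_open_common below). */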
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv)
{
    int ret, open_flags;
    const char *filename;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so swap it into bs (file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(drv->bdrv_parse_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags);
    } else {
        if (file == NULL) {
            qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a "
                          "block driver for the protocol level",
                          drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        assert(file != NULL);
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags);
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(filename != NULL);
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
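/* e.g. bdrv_file_open(&bs, "/images/a.img", NULL, BDRV_O_RDWR) opens the
 * file through the "file" protocol driver with an empty option set. */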
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and "
                      "'filename' options at the same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            qerror_report(ERROR_CLASS_GENERIC_ERROR, "Unknown protocol");
        }
    } else {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        Error *local_err = NULL;
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            qerror_report_err(local_err);
            error_free(local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (!drv->bdrv_parse_filename && !filename) {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "The '%s' block driver requires a file name",
                      drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, options, flags, drv);
    if (ret < 0) {
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
                      "support the option '%s'",
                      drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_delete(bs);
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv);
    if (ret < 0) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        return ret;
    }
    return 0;
}
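
/* Move every "start"-prefixed entry from src into a new dict, with the
 * prefix stripped: e.g. start = "file." moves "file.driver" to "driver". */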
static void extract_subqdict(QDict *src, QDict **dst, const char *start)
{
    const QDictEntry *entry, *next;
    const char *p;

    *dst = qdict_new();
    entry = qdict_first(src);

    while (entry != NULL) {
        next = qdict_next(src, entry);
        if (strstart(entry->key, start, &p)) {
            qobject_incref(entry->value);
            qdict_put_obj(*dst, p, entry->value);
            qdict_del(src, entry->key);
        }
        entry = next;
    }
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        char backing_filename[PATH_MAX];

        if (qdict_size(options) != 0) {
            error_report("Can't use snapshot=on with driver-specific options");
            ret = -EINVAL;
            goto fail;
        }
        assert(filename != NULL);

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, NULL, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            goto fail;
        }

        /* Real path is meaningless for protocols */
        if (path_has_protocol(filename)) {
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        } else if (!realpath(filename, backing_filename)) {
            ret = -errno;
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
                             backing_filename);
        if (drv) {
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
        free_option_parameters(create_options);
        if (ret < 0) {
            goto fail;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP));
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(file, filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_delete(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
            "device '%s' doesn't support the option '%s'",
            drv->format_name, bs->device_name, entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_delete(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
1159
                                    BlockDriverState *bs, int flags)
1160
{
1161
    assert(bs != NULL);
1162

    
1163
    BlockReopenQueueEntry *bs_entry;
1164
    if (bs_queue == NULL) {
1165
        bs_queue = g_new0(BlockReopenQueue, 1);
1166
        QSIMPLEQ_INIT(bs_queue);
1167
    }
1168

    
1169
    if (bs->file) {
1170
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1171
    }
1172

    
1173
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1174
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1175

    
1176
    bs_entry->state.bs = bs;
1177
    bs_entry->state.flags = flags;
1178

    
1179
    return bs_queue;
1180
}
1181

    
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states
   list. Also, NUL-terminate the device_name to prevent a double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->slice_submitted    = bs_src->slice_submitted;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}
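
/* A media change is reported as an eject/insert cycle: emit a tray-open
 * event if the tray was closed, then a tray-close event once the new
 * medium is loaded. */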
1693

    
1694
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1695
{
1696
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1697
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1698
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1699
        if (tray_was_closed) {
1700
            /* tray open */
1701
            bdrv_emit_qmp_eject_event(bs, true);
1702
        }
1703
        if (load) {
1704
            /* tray close */
1705
            bdrv_emit_qmp_eject_event(bs, false);
1706
        }
1707
    }
1708
}
1709

    
1710
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1711
{
1712
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1713
}
1714

    
1715
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1716
{
1717
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1718
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1719
    }
1720
}
1721

    
1722
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1723
{
1724
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1725
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
1726
    }
1727
    return false;
1728
}
1729

    
1730
static void bdrv_dev_resize_cb(BlockDriverState *bs)
1731
{
1732
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
1733
        bs->dev_ops->resize_cb(bs->dev_opaque);
1734
    }
1735
}
1736

    
1737
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1738
{
1739
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1740
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1741
    }
1742
    return false;
1743
}
1744

    
1745
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
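
/* Illustrative caller sketch (field names follow BdrvCheckResult): request
 * leak repair and inspect the outcome afterwards.
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_LEAKS);
 *     if (ret == 0 && result.corruptions > 0) {
 *         // the check itself ran fine, but the image is damaged
 *     }
 */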

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
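
/* For example, committing a qcow2 overlay that sits on a raw base image
 * copies every allocated range of the overlay down into the base, flushes
 * the base, and (if the format implements bdrv_make_empty) empties the
 * overlay, leaving both images with identical guest-visible contents. */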

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
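
/* Worked example: with a 64 KiB cluster size, c = 65536 / 512 = 128
 * sectors.  Rounding the region [100, 100 + 50) yields
 * cluster_sector_num = QEMU_ALIGN_DOWN(100, 128) = 0 and
 * cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 50, 128) = 256, i.e.
 * exactly the two clusters the request touches. */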

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
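
/* E.g. a tracked request covering sectors [0, 8) does not overlap a new
 * request at [8, 16) (8 >= 0 + 8), but does overlap [4, 12): neither
 * interval ends at or before the start of the other. */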

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}
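
/* For instance, in the chain base <- snap <- active,
 * bdrv_find_overlay(active, base) returns snap (the BDS whose backing_hd
 * is base), while bdrv_find_overlay(active, active) returns NULL. */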

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_delete(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}
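
/* The final test is phrased as (len - offset < size) rather than
 * (offset + size > len) so that a huge size cannot wrap the addition:
 * with len = 1024 and offset = 512, any size above 512 bytes fails via
 * 1024 - 512 < size, with no overflow-prone arithmetic involved. */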

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
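
/* Illustrative synchronous use, e.g. fetching a boot sector (the buffer
 * name is arbitrary):
 *
 *     uint8_t mbr[BDRV_SECTOR_SIZE];
 *     if (bdrv_read(bs, 0, mbr, 1) < 0) {
 *         // handle the error
 *     }
 */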

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
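
/* Worked example: bdrv_pread(bs, 1000, buf, 2000) with 512-byte sectors
 * splits into three phases: a 24-byte head copied out of sector 1 (bytes
 * 1000-1023), a 1536-byte in-place body of three whole sectors (bytes
 * 1024-2559), and a 440-byte tail copied out of sector 5 (bytes
 * 2560-2999). */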

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating the cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, so it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        bdrv_set_dirty(bs, sector_num, nb_sectors);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by their
 * actual allocated space. Returns < 0 on error or if unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
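
/* Illustrative allocation-map scan (cf. the loop in bdrv_commit() above);
 * 'total_sectors' is assumed to come from bdrv_getlength():
 *
 *     int64_t sector = 0;
 *     int n;
 *     while (sector < total_sectors) {
 *         int alloc = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
 *         // 'n' sectors starting at 'sector' share the state 'alloc'
 *         sector += n;
 *     }
 */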

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
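
/* For example, in the chain base <- mid <- top, a query such as
 * bdrv_co_is_allocated_above(top, base, 100, 10, &pnum) returns 1 as soon
 * as either top or mid reports sector 100 allocated; base itself is never
 * consulted, since the walk stops when it reaches 'base'. */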

/* Coroutine wrapper for bdrv_is_allocated_above() */
static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *top = data->bs;
    BlockDriverState *base = data->base;

    data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
                                           data->nb_sectors, data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated_above().
 *
 * See bdrv_co_is_allocated_above() for details.
 */
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
                            int64_t sector_num, int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = top,
        .base = base,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}
3426

    
3427
/*
3428
 * Takes a bunch of requests and tries to merge them. Returns the number of
3429
 * requests that remain after merging.
3430
 */
3431
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3432
    int num_reqs, MultiwriteCB *mcb)
3433
{
3434
    int i, outidx;
3435

    
3436
    // Sort requests by start sector
3437
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3438

    
3439
    // Check if adjacent requests touch the same clusters. If so, combine them,
3440
    // filling up gaps with zero sectors.
3441
    outidx = 0;
3442
    for (i = 1; i < num_reqs; i++) {
3443
        int merge = 0;
3444
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3445

    
3446
        // Handle exactly sequential writes and overlapping writes.
3447
        if (reqs[i].sector <= oldreq_last) {
3448
            merge = 1;
3449
        }
3450

    
3451
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3452
            merge = 0;
3453
        }
3454

    
3455
        if (merge) {
3456
            size_t size;
3457
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3458
            qemu_iovec_init(qiov,
3459
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3460

    
3461
            // Add the first request to the merged one. If the requests are
3462
            // overlapping, drop the last sectors of the first request.
3463
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
3464
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3465

    
3466
            // We should need to add any zeros between the two requests
3467
            assert (reqs[i].sector <= oldreq_last);
3468

    
3469
            // Add the second request
3470
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3471

    
3472
            reqs[outidx].nb_sectors = qiov->size >> 9;
3473
            reqs[outidx].qiov = qiov;
3474

    
3475
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3476
        } else {
3477
            outidx++;
3478
            reqs[outidx].sector     = reqs[i].sector;
3479
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3480
            reqs[outidx].qiov       = reqs[i].qiov;
3481
        }
3482
    }
3483

    
3484
    return outidx + 1;
3485
}
3486

    
3487
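/* A worked example of the merge arithmetic above (hypothetical values):
 * if reqs[outidx] covers sectors [0, 8) and reqs[i] starts at sector 4,
 * then size = (4 - 0) << 9 = 2048, so only the first 2048 bytes of the
 * old qiov are kept and all of reqs[i]'s qiov is appended; the merged
 * request covers sectors [0, 4 + reqs[i].nb_sectors).
 */
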
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the requests
 * may or may not have been submitted yet. In particular, this means that the
 * callback will be called for some of the requests, but not for others. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}

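/* Illustrative sketch of a caller (the names my_write_cb, qiov1/qiov2 and
 * req1/req2 are hypothetical; virtio-blk is the real user of this API):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8,  .qiov = &qiov1,
 *           .cb = my_write_cb, .opaque = req1 },
 *         { .sector = 8, .nb_sectors = 16, .qiov = &qiov2,
 *           .cb = my_write_cb, .opaque = req2 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // only requests with reqs[i].error == 0 will get a callback
 *     }
 *
 * Since the two requests are exactly sequential, multiwrite_merge() would
 * combine them into a single 24-sector write.
 */
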
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    uint64_t extension;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->slice_submitted.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->slice_submitted.bytes[!is_write];
    }

    /* bytes_base: the number of bytes already read/written in this slice,
     *             taken from the accounting statistics.
     * bytes_res:  the remaining number of bytes that still need to be
     *             read/written.
     * (bytes_base + bytes_res) / bps_limit: the total time needed to
     *             complete reading/writing all of the data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the configured limit,
     * bs->slice_end needs to be extended so that the current statistics
     * are kept until the throttling timer fires; the extension is rounded
     * up to a whole number of slices.
     */
    extension = wait_time * NANOSECONDS_PER_SECOND;
    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
                BLOCK_IO_SLICE_TIME;
    bs->slice_end += extension;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}

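/* A worked numeric example (assuming BLOCK_IO_SLICE_TIME is 100 ms and a
 * hypothetical limit of bps_limit = 10 MB/s = 10485760 bytes/s):
 * bytes_limit = 10485760 * 0.1 = 1048576 bytes per slice.  If 921600
 * bytes were already submitted in this slice and a 256 KiB (262144-byte)
 * request arrives, then bytes_base + bytes_res = 1183744 > bytes_limit,
 * so the request must wait roughly 1183744 / 10485760 - elapsed_time
 * seconds, and the slice end is pushed out accordingly.
 */
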
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->slice_submitted.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->slice_submitted.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch, in seconds */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Exceeded current slice, extend it by another slice time */
    bs->slice_end += BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}

static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if (now > bs->slice_end) {
        bs->slice_start = now;
        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
                                           BDRV_SECTOR_SIZE;
    bs->slice_submitted.ios[is_write]++;

    return false;
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

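/* Illustrative sketch of how a driver defines its own AIOCB type (the
 * MyDriverAIOCB/mydriver_* names are hypothetical):
 *
 *     typedef struct MyDriverAIOCB {
 *         BlockDriverAIOCB common;   // must be the first field
 *         // ... driver-specific state ...
 *     } MyDriverAIOCB;
 *
 *     static const AIOCBInfo mydriver_aiocb_info = {
 *         .aiocb_size = sizeof(MyDriverAIOCB),
 *         .cancel     = mydriver_aio_cancel,
 *     };
 *
 *     MyDriverAIOCB *acb = qemu_aio_get(&mydriver_aiocb_info, bs, cb, opaque);
 *     ...
 *     qemu_aio_release(acb);
 */
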
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

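/* The pattern used below: the coroutine fills in a CoroutineIOCompletion
 * on its stack, submits the AIO request with bdrv_co_io_em_complete() as
 * the completion callback and &co as the opaque pointer, and yields.
 * When the AIO request finishes, the callback stores the return value in
 * co.ret and re-enters the coroutine, which resumes after the
 * qemu_coroutine_yield() and returns co.ret.
 */
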
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

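/* Note on the cascade in bdrv_co_flush() above: driver callbacks are tried
 * in order of preference -- a native coroutine callback
 * (bdrv_co_flush_to_disk), then an AIO callback (bdrv_aio_flush) bridged
 * through bdrv_co_io_em_complete(), and finally a no-op for drivers that
 * cannot flush.  The call then recurses into bs->file so the underlying
 * protocol is flushed as well.
 */
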
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}

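/* If a vector fails the check above (e.g. for files opened with O_DIRECT),
 * the request cannot be passed through directly; the caller is expected to
 * fall back to a bounce buffer allocated with qemu_blockalign(), which
 * honours bs->buffer_alignment.
 */
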
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        granularity >>= BDRV_SECTOR_BITS;
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

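/* A worked example: bdrv_set_dirty_tracking(bs, 65536) requests a 64 KiB
 * granularity.  65536 >> BDRV_SECTOR_BITS = 128 sectors, and
 * ffs(128) - 1 = 7, so the HBitmap tracks dirtiness at a granularity of
 * 2^7 = 128 sectors (64 KiB) per bit.
 */
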
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (bs->dirty_bitmap) {
        return hbitmap_get(bs->dirty_bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (bs->dirty_bitmap) {
        return hbitmap_count(bs->dirty_bitmap);
    } else {
        return 0;
    }
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

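/* Illustrative sketch of how device models use the accounting hooks (the
 * cookie variable is hypothetical; IDE and virtio-blk follow this shape):
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     // ... issue the read and wait for completion ...
 *     bdrv_acct_done(bs, &cookie);
 */
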
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s'",
                                 backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_setg(errp, "Formatting or formatting option not supported "
                             "for file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            const char *cluster_size_hint = "";
            if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
                cluster_size_hint = " (try using a larger cluster size)";
            }
            error_setg(errp, "The image size is too large for file format "
                             "'%s'%s", fmt, cluster_size_hint);
        } else {
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
                       strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }
}

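/* Illustrative sketch (hypothetical file names): creating a qcow2 overlay
 * whose size is taken from its backing file, as qemu-img does:
 *
 *     Error *local_err = NULL;
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.raw", "raw",
 *                     NULL, -1, 0, &local_err, false);
 *     if (local_err) {
 *         // report and free local_err
 *     }
 *
 * Passing -1 as img_size leaves BLOCK_OPT_SIZE unset, so the size is
 * read from the backing file via bdrv_get_geometry() above.
 */
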
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}