/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* We aim to preserve each request's timing by servicing them in FIFO
     * order.  The next throttled request is not dequeued until the current
     * one has been allowed to proceed, so if the current request still
     * exceeds the limits it is re-inserted at the head of the queue; all
     * requests behind it remain in throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
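
/*
 * Throttling control flow, summarized (comment added for clarity; it is not
 * part of the original source):
 *
 *   bdrv_io_limits_enable()    - init throttled_reqs and create a vm_clock
 *                                timer
 *   bdrv_io_limits_intercept() - called per request; the coroutine waits in
 *                                throttled_reqs while bdrv_exceed_io_limits()
 *                                says the limits are exceeded
 *   bdrv_block_timer()         - timer callback; wakes the next queued request
 *   bdrv_io_limits_disable()   - drain the queue and tear down the timer
 */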

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
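
/*
 * Illustrative sketch (not in the original source): with the rules above,
 *
 *   path_has_protocol("nbd:localhost:10809") -> 1  (':' comes before any '/')
 *   path_has_protocol("/images/disk.img")    -> 0  ('/' comes first)
 *   path_has_protocol("c:\\disk.img")        -> 0  on _WIN32 (drive prefix)
 */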

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
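
/*
 * Illustrative examples (not in the original source):
 *
 *   path_combine(buf, n, "/vm/base.qcow2", "snap.qcow2")
 *       -> "/vm/snap.qcow2"        (relative: directory of base_path kept)
 *   path_combine(buf, n, "/vm/base.qcow2", "/abs/snap.qcow2")
 *       -> "/abs/snap.qcow2"       (absolute: copied as-is)
 *   path_combine(buf, n, "http://host/dir/base", "snap")
 *       -> "http://host/dir/snap"  (URL: everything up to the last '/')
 */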

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
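
/*
 * Usage sketch (hypothetical, not in the original source); this mirrors
 * what the snapshot=on path in bdrv_open() does below:
 *
 *   BlockDriver *drv = bdrv_find_format("qcow2");
 *   QEMUOptionParameter *opts =
 *       parse_option_parameters("", drv->create_options, NULL);
 *   set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024);
 *   int ret = bdrv_create(drv, "test.qcow2", opts);
 *   free_option_parameters(opts);
 */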

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
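
/*
 * Usage sketch (hypothetical): the buffer must be large enough on both
 * platforms, e.g.
 *
 *   char tmp[PATH_MAX + 1];
 *   int ret = get_tmp_filename(tmp, sizeof(tmp));
 *   // on POSIX, tmp now names an empty "$TMPDIR/vl.XXXXXX" file
 */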

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
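
/*
 * Resolution examples (illustrative, not in the original source):
 *
 *   "/dev/cdrom"          -> a host device driver, via bdrv_probe_device()
 *   "nbd:localhost:10809" -> the driver whose protocol_name is "nbd"
 *   "disk.img"            -> the "file" driver (no "<protocol>:" prefix)
 */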

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
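
/*
 * Summary of the mapping implemented above (added for reference):
 *
 *   mode          flags
 *   ------------  ---------------------------------
 *   off/none      BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   directsync    BDRV_O_NOCACHE
 *   writeback     BDRV_O_CACHE_WB
 *   unsafe        BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   writethrough  (none; this is the default)
 */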

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv)
{
    int ret, open_flags;
    const char *filename;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* bdrv_open() was called with a protocol driver directly as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(drv->bdrv_parse_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags);
    } else {
        if (file == NULL) {
            qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a "
                          "block driver for the protocol level",
                          drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        assert(file != NULL);
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags);
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(filename != NULL);
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
    } else {
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and "
                      "'filename' options at the same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename);
    } else {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        Error *local_err = NULL;
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            qerror_report_err(local_err);
            error_free(local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (!drv->bdrv_parse_filename && !filename) {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "The '%s' block driver requires a file name",
                      drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, options, flags, drv);
    if (ret < 0) {
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
                      "support the option '%s'",
                      drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_delete(bs);
    return ret;
}
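
/*
 * Usage sketch (hypothetical, not in the original source):
 *
 *   BlockDriverState *bs;
 *   QDict *opts = qdict_new();
 *   qdict_put(opts, "driver", qstring_from_str("file"));
 *   int ret = bdrv_file_open(&bs, "/images/disk.img", opts, BDRV_O_RDWR);
 *
 * The reference to opts is consumed even on failure; QINCREF() it first if
 * it is to be reused, as the comment above bdrv_file_open() explains.
 */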

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv);
    if (ret < 0) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        return ret;
    }
    return 0;
}

static void extract_subqdict(QDict *src, QDict **dst, const char *start)
{
    const QDictEntry *entry, *next;
    const char *p;

    *dst = qdict_new();
    entry = qdict_first(src);

    while (entry != NULL) {
        next = qdict_next(src, entry);
        if (strstart(entry->key, start, &p)) {
            qobject_incref(entry->value);
            qdict_put_obj(*dst, p, entry->value);
            qdict_del(src, entry->key);
        }
        entry = next;
    }
}
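
/*
 * Illustrative example (not in the original source): with
 *   src = { "file.driver": "file", "file.filename": "a.img", "size": "1M" }
 * extract_subqdict(src, &dst, "file.") leaves
 *   src = { "size": "1M" }
 *   dst = { "driver": "file", "filename": "a.img" }
 */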

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        char backing_filename[PATH_MAX];

        if (qdict_size(options) != 0) {
            error_report("Can't use snapshot=on with driver-specific options");
            ret = -EINVAL;
            goto fail;
        }
        assert(filename != NULL);

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, NULL, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            goto fail;
        }

        /* Real path is meaningless for protocols */
        if (path_has_protocol(filename)) {
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        } else if (!realpath(filename, backing_filename)) {
            ret = -errno;
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
                             backing_filename);
        if (drv) {
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
        free_option_parameters(create_options);
        if (ret < 0) {
            goto fail;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP));
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(file, filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_delete(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
            "device '%s' doesn't support the option '%s'",
            drv->format_name, bs->device_name, entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_delete(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue, on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
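
/*
 * Multi-device usage sketch (hypothetical, not in the original source):
 *
 *   BlockReopenQueue *queue = NULL;
 *   queue = bdrv_reopen_queue(queue, bs_a, flags_a);
 *   queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *   ret = bdrv_reopen_multiple(queue, &errp);   // all-or-nothing
 *
 * bdrv_reopen() above is exactly this pattern for a single device.
 */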


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NUL-terminate the device_name to prevent a double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->slice_submitted    = bs_src->slice_submitted;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
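
/*
 * Illustrative sketch of bdrv_append()'s effect (not in the original source):
 *
 *   before:  device -> bs_top -> [backing chain...]
 *   after:   device -> bs_top (now holding bs_new's contents)
 *                          -> bs_new (holding the old top) -> [backing chain...]
 *
 * The device keeps its bs_top pointer; bdrv_swap() moved the old contents
 * into bs_new, which is then installed as bs_top's backing file.
 */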
1589

    
1590
void bdrv_delete(BlockDriverState *bs)
1591
{
1592
    assert(!bs->dev);
1593
    assert(!bs->job);
1594
    assert(!bs->in_use);
1595

    
1596
    /* remove from list, if necessary */
1597
    bdrv_make_anon(bs);
1598

    
1599
    bdrv_close(bs);
1600

    
1601
    g_free(bs);
1602
}
1603

    
1604
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1605
/* TODO change to DeviceState *dev when all users are qdevified */
1606
{
1607
    if (bs->dev) {
1608
        return -EBUSY;
1609
    }
1610
    bs->dev = dev;
1611
    bdrv_iostatus_reset(bs);
1612
    return 0;
1613
}
1614

    
1615
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1616
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1617
{
1618
    if (bdrv_attach_dev(bs, dev) < 0) {
1619
        abort();
1620
    }
1621
}
1622

    
1623
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1624
/* TODO change to DeviceState *dev when all users are qdevified */
1625
{
1626
    assert(bs->dev == dev);
1627
    bs->dev = NULL;
1628
    bs->dev_ops = NULL;
1629
    bs->dev_opaque = NULL;
1630
    bs->buffer_alignment = 512;
1631
}
1632

    
1633
/* TODO change to return DeviceState * when all users are qdevified */
1634
void *bdrv_get_attached_dev(BlockDriverState *bs)
1635
{
1636
    return bs->dev;
1637
}
1638

    
1639
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1640
                      void *opaque)
1641
{
1642
    bs->dev_ops = ops;
1643
    bs->dev_opaque = opaque;
1644
}
1645

    
1646
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1647
                               enum MonitorEvent ev,
1648
                               BlockErrorAction action, bool is_read)
1649
{
1650
    QObject *data;
1651
    const char *action_str;
1652

    
1653
    switch (action) {
1654
    case BDRV_ACTION_REPORT:
1655
        action_str = "report";
1656
        break;
1657
    case BDRV_ACTION_IGNORE:
1658
        action_str = "ignore";
1659
        break;
1660
    case BDRV_ACTION_STOP:
1661
        action_str = "stop";
1662
        break;
1663
    default:
1664
        abort();
1665
    }
1666

    
1667
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1668
                              bdrv->device_name,
1669
                              action_str,
1670
                              is_read ? "read" : "write");
1671
    monitor_protocol_event(ev, data);
1672

    
1673
    qobject_decref(data);
1674
}
1675

    
1676
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1677
{
1678
    QObject *data;
1679

    
1680
    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1681
                              bdrv_get_device_name(bs), ejected);
1682
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1683

    
1684
    qobject_decref(data);
1685
}
1686

    
1687
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1688
{
1689
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1690
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1691
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1692
        if (tray_was_closed) {
1693
            /* tray open */
1694
            bdrv_emit_qmp_eject_event(bs, true);
1695
        }
1696
        if (load) {
1697
            /* tray close */
1698
            bdrv_emit_qmp_eject_event(bs, false);
1699
        }
1700
    }
1701
}
1702

    
1703
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1704
{
1705
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1706
}
1707

    
1708
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1709
{
1710
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1711
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1712
    }
1713
}
1714

    
1715
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1716
{
1717
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1718
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
1719
    }
1720
    return false;
1721
}
1722

    
1723
static void bdrv_dev_resize_cb(BlockDriverState *bs)
1724
{
1725
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
1726
        bs->dev_ops->resize_cb(bs->dev_opaque);
1727
    }
1728
}
1729

    
1730
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1731
{
1732
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1733
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1734
    }
1735
    return false;
1736
}
1737

    
1738
/*
1739
 * Run consistency checks on an image
1740
 *
1741
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1742
 * free of errors) or -errno when an internal error occurred. The results of the
1743
 * check are stored in res.
1744
 */
1745
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1746
{
1747
    if (bs->drv->bdrv_check == NULL) {
1748
        return -ENOTSUP;
1749
    }
1750

    
1751
    memset(res, 0, sizeof(*res));
1752
    return bs->drv->bdrv_check(bs, res, fix);
1753
}
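
/* Editor's usage sketch (not part of the original file): a caller runs a
 * report-only consistency check and inspects the result.  The device name
 * "ide0-hd0" is hypothetical.  Guarded by #if 0 so it is compiled out. */
#if 0
static void example_check_image(void)
{
    BlockDriverState *bs = bdrv_find("ide0-hd0");
    BdrvCheckResult res;
    int ret;

    if (!bs) {
        return;
    }
    ret = bdrv_check(bs, &res, 0);      /* fix == 0: report only */
    if (ret < 0) {
        fprintf(stderr, "check failed: %s\n", strerror(-ret));
    } else if (res.corruptions || res.leaks) {
        fprintf(stderr, "%d corruptions, %d leaks\n",
                res.corruptions, res.leaks);
    }
}
#endif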

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
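
/* Editor's usage sketch (hypothetical, compiled out): committing every
 * device that has a backing file, similar to bdrv_commit_all() below, but
 * reporting per-device failures instead of stopping at the first error. */
#if 0
static void example_commit_everything(void)
{
    BlockDriverState *bs;

    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0 && ret != -ENOTSUP) {
                fprintf(stderr, "commit of '%s' failed: %s\n",
                        bdrv_get_device_name(bs), strerror(-ret));
            }
        }
    }
}
#endif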

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
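
/* Worked example (editor's note): with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors.  A request for sectors [200, 200 + 10) is
 * widened to cluster_sector_num = QEMU_ALIGN_DOWN(200, 128) = 128 and
 * cluster_nb_sectors = QEMU_ALIGN_UP(200 - 128 + 10, 128) = 128, i.e. the
 * single cluster covering sectors [128, 256). */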

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
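
/* Editor's usage sketch (compiled out; the path and format name are
 * hypothetical): rewriting an image's backing file entry after its backing
 * image has been moved. */
#if 0
static int example_repoint_backing_file(BlockDriverState *bs)
{
    int ret = bdrv_change_backing_file(bs, "/new/path/base.qcow2", "qcow2");
    if (ret == -ENOTSUP) {
        fprintf(stderr, "format driver cannot rewrite the backing file\n");
    }
    return ret;
}
#endif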

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_delete(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
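
/* Editor's usage sketch (compiled out): collapsing everything between the
 * active layer's immediate backing file ("top") and the base of the chain,
 * turning base <- ... <- top <- active into base <- active. */
#if 0
static int example_flatten_chain(BlockDriverState *active)
{
    BlockDriverState *top = active->backing_hd;
    BlockDriverState *base = bdrv_find_base(active);

    if (!top || top == base) {
        return 0;       /* nothing to drop */
    }
    return bdrv_drop_intermediate(active, top, base);
}
#endif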

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}
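
/* Editor's usage sketch (compiled out): synchronous, sector-granularity
 * copy of one sector; buffers passed to bdrv_read()/bdrv_write() must be
 * at least nb_sectors * BDRV_SECTOR_SIZE bytes long. */
#if 0
static int example_copy_sector(BlockDriverState *src, BlockDriverState *dst,
                               int64_t sector_num)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret = bdrv_read(src, sector_num, buf, 1);
    if (ret < 0) {
        return ret;
    }
    return bdrv_write(dst, sector_num, buf, 1);
}
#endif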

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
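
/* Editor's usage sketch (compiled out): bdrv_pread() hides the sector
 * split above, so a caller can fetch a misaligned field directly; the
 * offset 17 and length used here are arbitrary illustration values. */
#if 0
static int example_read_unaligned(BlockDriverState *bs)
{
    uint8_t magic[7];
    int ret = bdrv_pread(bs, 17, magic, sizeof(magic));
    return ret < 0 ? ret : 0;   /* on success, ret == sizeof(magic) */
}
#endif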

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* Writethrough cache modes flush on every write, so an explicit flush
     * is only needed when the write cache is enabled. */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
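
/* Editor's usage sketch (compiled out): metadata updates that later writes
 * depend on (e.g. a format header field) should go through
 * bdrv_pwrite_sync() so they are stable on disk before proceeding.  The
 * offset and value below are illustrative only. */
#if 0
static int example_update_header_field(BlockDriverState *bs)
{
    uint32_t flags = 0x1;
    return bdrv_pwrite_sync(bs, 8 /* hypothetical header offset */,
                            &flags, sizeof(flags));
}
#endif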

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating the cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        bdrv_set_dirty(bs, sector_num, nb_sectors);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}
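
/* Editor's usage sketch (compiled out): callers already running in
 * coroutine context (coroutine_fn) build a QEMUIOVector and call
 * bdrv_co_readv() directly instead of the synchronous wrappers. */
#if 0
static int coroutine_fn example_co_read(BlockDriverState *bs, uint8_t *buf)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_co_readv(bs, 0, 1, &qiov);
}
#endif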

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}
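
/* Editor's note: byte lengths and sector counts convert with
 * BDRV_SECTOR_SIZE (512) and BDRV_SECTOR_BITS (9); e.g. a 1 MiB image is
 * 1048576 >> 9 = 2048 sectors, and bdrv_getlength() above reports
 * total_sectors * 512 when the driver cannot be queried directly. */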

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
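
/* Editor's usage sketch (compiled out): the intended device-model pattern
 * is to map an errno to an action and then act on it; IDE and virtio-blk
 * follow essentially this shape in their completion paths. */
#if 0
static bool example_handle_rw_error(BlockDriverState *bs, bool is_read,
                                    int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    bdrv_error_action(bs, action, is_read, error);
    /* the caller retries the request later iff the VM was stopped */
    return action == BDRV_ACTION_STOP;
}
#endif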

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}
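
/* Editor's usage sketch (compiled out): bdrv_next(NULL) starts an
 * iteration over all named BlockDriverStates, equivalent to the
 * QTAILQ_FOREACH loops used elsewhere in this file. */
#if 0
static void example_list_devices(void)
{
    BlockDriverState *bs;

    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
        printf("%s: %s\n", bdrv_get_device_name(bs),
               bdrv_get_format_name(bs) ?: "(no medium)");
    }
}
#endif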

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
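
/* Editor's usage sketch (compiled out): walking an image as alternating
 * allocated/unallocated extents, the way qemu-img and the mirroring code
 * consume this API.  The 65536-sector chunk size is arbitrary. */
#if 0
static void example_scan_extents(BlockDriverState *bs)
{
    int64_t sector = 0;
    int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    while (sector < total) {
        int n;
        int ret = bdrv_is_allocated(bs, sector, 65536, &n);
        if (ret < 0 || n == 0) {
            break;
        }
        printf("[%" PRId64 ", +%d) %s\n", sector, n,
               ret ? "allocated" : "unallocated");
        sector += n;
    }
}
#endif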

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

/* Coroutine wrapper for bdrv_is_allocated_above() */
static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *top = data->bs;
    BlockDriverState *base = data->base;

    data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
                                           data->nb_sectors, data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated_above().
 *
 * See bdrv_co_is_allocated_above() for details.
 */
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
                            int64_t sector_num, int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = top,
        .base = base,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}
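
/* Editor's usage sketch (compiled out): an asynchronous read; the
 * completion callback (a BlockDriverCompletionFunc) runs later from the
 * main loop with ret < 0 on error. */
#if 0
static void example_read_done(void *opaque, int ret)
{
    *(int *)opaque = ret;
}

static void example_submit_aio_read(BlockDriverState *bs, QEMUIOVector *qiov,
                                    int *status)
{
    BlockDriverAIOCB *acb;

    acb = bdrv_aio_readv(bs, 0, qiov, qiov->size >> BDRV_SECTOR_BITS,
                         example_read_done, status);
    if (!acb) {
        *status = -EIO;   /* submission failed, callback will not run */
    }
}
#endif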

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
3479

    
3480
/*
3481
 * Submit multiple AIO write requests at once.
3482
 *
3483
 * On success, the function returns 0 and all requests in the reqs array have
3484
 * been submitted. In error case this function returns -1, and any of the
3485
 * requests may or may not be submitted yet. In particular, this means that the
3486
 * callback will be called for some of the requests, for others it won't. The
3487
 * caller must check the error field of the BlockRequest to wait for the right
3488
 * callbacks (if error != 0, no callback will be called).
3489
 *
3490
 * The implementation may modify the contents of the reqs array, e.g. to merge
3491
 * requests. However, the fields opaque and error are left unmodified as they
3492
 * are used to signal failure for a single request to the caller.
3493
 */
3494
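/* A minimal usage sketch (hypothetical caller and callback, error handling
 * omitted):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_write_cb, .opaque = req0_state },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_write_cb, .opaque = req1_state },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // only requests with reqs[i].error == 0 will get a callback
 *     }
 *
 * The two adjacent requests above would be merged into a single write. */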
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/* block I/O throttling */
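/* Check whether submitting another nb_sectors would exceed the configured
 * bytes-per-second limit within the current time slice.  If so, *wait
 * receives the estimated wait time in nanoseconds and the slice end is
 * extended so that the accounting survives until the throttling timer
 * fires. */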
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    uint64_t extension;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->slice_submitted.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->slice_submitted.bytes[!is_write];
    }

    /* bytes_base: the number of bytes already read/written in this slice,
     *             obtained from the accounting statistics.
     * bytes_res:  the remaining number of bytes that need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calculate the total time
     *             needed to complete reading/writing all the data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limit, bs->slice_end needs
     * to be extended so that the current statistics are kept until the
     * timer fires.  The extension is rounded up to a multiple of the slice
     * time, a value tuned by experiment.
     */
    extension = wait_time * NANOSECONDS_PER_SECOND;
    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
                BLOCK_IO_SLICE_TIME;
    bs->slice_end += extension;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}

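/* Same idea as bdrv_exceed_bps_limits(), but for the operations-per-second
 * limit: a request counts as one I/O regardless of its size. */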
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->slice_submitted.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->slice_submitted.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch, in seconds */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Exceeded current slice, extend it by another slice time */
    bs->slice_end += BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}

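/* Combined throttling check: start a new slice if the previous one has
 * expired, consult both the bps and iops limits, and either record the
 * request in the slice statistics or report the longer of the two wait
 * times. */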
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if (now > bs->slice_end) {
        bs->slice_start = now;
        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
                                           BDRV_SECTOR_SIZE;
    bs->slice_submitted.ios[is_write]++;

    return false;
}

/**************************************************************/
/* async block device emulation */

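/* Emulates AIO on top of a driver's synchronous bdrv_read/bdrv_write: the
 * request is carried out synchronously through an aligned bounce buffer,
 * and the completion callback is deferred to a bottom half so that it never
 * runs before the submission function has returned. */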
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

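/* AIO request state for the coroutine-based emulation below: the request is
 * processed by a coroutine and completion is reported from a bottom half.
 * done, when non-NULL, lets bdrv_aio_co_cancel_em() busy-wait for it. */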
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

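/* Allocate a driver-specific AIOCB (aiocb_info->aiocb_size bytes) and fill
 * in the common fields; the counterpart qemu_aio_release() returns it to
 * the slice allocator once the request has completed or been cancelled. */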
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

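/* The inverse of the emulation above: implement a coroutine read/write on
 * top of a driver that only provides the AIO interface.  The request is
 * submitted with bdrv_co_io_em_complete() as its callback and the coroutine
 * yields until that callback re-enters it with the result. */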
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

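/* Flush everything down the stack: first the format driver's caches to the
 * OS (even with cache=unsafe), then to the disk unless BDRV_O_NO_FLUSH is
 * set, and finally recurse into bs->file for the protocol layer. */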
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

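/* Synchronous wrapper around bdrv_co_flush(): runs the coroutine entry
 * directly when already in coroutine context, otherwise spawns a coroutine
 * and pumps the AIO event loop until it finishes. */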
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

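/* Allocate an I/O buffer honouring the device's buffer alignment
 * requirement; falls back to 512-byte alignment when none is set. */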
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}

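/* Enable dirty tracking at the given granularity (in bytes; must be a power
 * of two), or disable it and free the bitmap when granularity is 0. */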
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        granularity >>= BDRV_SECTOR_BITS;
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (bs->dirty_bitmap) {
        return hbitmap_get(bs->dirty_bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (bs->dirty_bitmap) {
        return hbitmap_count(bs->dirty_bitmap);
    } else {
        return 0;
    }
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

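/* I/O accounting: bdrv_acct_start() records the size and start time of a
 * request in the caller's cookie; bdrv_acct_done() folds the cookie into
 * the per-device totals (bytes, ops and service time per I/O type). */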
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

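/* Create a new disk image.  fmt selects the format driver, options carries
 * the "-o" option string, and a missing size may be taken over from the
 * backing file.  Errors are reported through errp. */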
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s'",
                                 backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_setg(errp, "Formatting or formatting option not supported "
                             "for file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            const char *cluster_size_hint = "";
            if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
                cluster_size_hint = " (try using a larger cluster size)";
            }
            error_setg(errp, "The image size is too large for file format "
                             "'%s'%s", fmt, cluster_size_hint);
        } else {
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
                       strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

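/* Register a notifier to be invoked before each write request is
 * processed. */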
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}