Statistics
| Branch: | Revision:

root / block.c @ 834574ea

History | View | Annotate | Download (124 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "qmp-commands.h"
36
#include "qemu/timer.h"
37

    
38
#ifdef CONFIG_BSD
39
#include <sys/types.h>
40
#include <sys/stat.h>
41
#include <sys/ioctl.h>
42
#include <sys/queue.h>
43
#ifndef __DragonFly__
44
#include <sys/disk.h>
45
#endif
46
#endif
47

    
48
#ifdef _WIN32
49
#include <windows.h>
50
#endif
51

    
52
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
53

    
54
/*
 * Modifier bits accepted by the `flags` parameter of bdrv_co_do_readv()
 * and bdrv_co_do_writev().  Each enumerator occupies its own bit.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
58

    
59
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
60
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62
        BlockDriverCompletionFunc *cb, void *opaque);
63
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65
        BlockDriverCompletionFunc *cb, void *opaque);
66
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67
                                         int64_t sector_num, int nb_sectors,
68
                                         QEMUIOVector *iov);
69
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70
                                         int64_t sector_num, int nb_sectors,
71
                                         QEMUIOVector *iov);
72
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
73
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74
    BdrvRequestFlags flags);
75
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
76
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77
    BdrvRequestFlags flags);
78
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79
                                               int64_t sector_num,
80
                                               QEMUIOVector *qiov,
81
                                               int nb_sectors,
82
                                               BlockDriverCompletionFunc *cb,
83
                                               void *opaque,
84
                                               bool is_write);
85
static void coroutine_fn bdrv_co_do_rw(void *opaque);
86
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
87
    int64_t sector_num, int nb_sectors);
88

    
89
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
90
        bool is_write, double elapsed_time, uint64_t *wait);
91
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
92
        double elapsed_time, uint64_t *wait);
93
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
94
        bool is_write, int64_t *wait);
95

    
96
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
97
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
98

    
99
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
100
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
101

    
102
/* The device to use for VM snapshots */
103
static BlockDriverState *bs_snapshots;
104

    
105
/* If non-zero, use only whitelisted block drivers */
106
static int use_bdrv_whitelist;
107

    
108
#ifdef _WIN32
109
/*
 * Return 1 when filename starts with an ASCII letter immediately followed
 * by ':' (for example "c:"), and 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = (letter >= 'a' && letter <= 'z');
    const int is_upper = (letter >= 'A' && letter <= 'Z');

    /* Bail out before looking at filename[1]; the string may be empty. */
    if (!is_lower && !is_upper) {
        return 0;
    }
    return filename[1] == ':';
}
115

    
116
/*
 * Return 1 when filename is exactly "<letter>:" or starts with one of the
 * two prefixes tested below (the second one is "//./"); return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    return strstart(filename, "//./", NULL) ? 1 : 0;
}
126
#endif
127

    
128
/* throttling disk I/O limits */
129
/*
 * Turn I/O throttling off for bs: clear the enabled flag, drain the
 * throttled-request queue, destroy the throttling timer if one exists and
 * reset the slice/accounting fields to zero.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    /* Keep calling until qemu_co_queue_next() reports nothing was left. */
    while (qemu_co_queue_next(&bs->throttled_reqs)) {
        continue;
    }

    if (bs->block_timer != NULL) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->slice_time  = 0;
    bs->slice_end   = 0;
    bs->slice_start = 0;
}
146

    
147
static void bdrv_block_timer(void *opaque)
148
{
149
    BlockDriverState *bs = opaque;
150

    
151
    qemu_co_queue_next(&bs->throttled_reqs);
152
}
153

    
154
/*
 * Turn I/O throttling on for bs: initialise the throttled-request queue,
 * create the vm_clock timer that fires bdrv_block_timer(), and set the
 * enabled flag.
 */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);

    bs->io_limits_enabled = true;
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
}
160

    
161
/*
 * Return true when at least one of the six configured limits of bs
 * (bps or iops, for read, write or total) is non-zero.
 */
bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *limits = &bs->io_limits;

    if (limits->bps[BLOCK_IO_LIMIT_READ] ||
        limits->bps[BLOCK_IO_LIMIT_WRITE] ||
        limits->bps[BLOCK_IO_LIMIT_TOTAL]) {
        return true;
    }
    if (limits->iops[BLOCK_IO_LIMIT_READ] ||
        limits->iops[BLOCK_IO_LIMIT_WRITE] ||
        limits->iops[BLOCK_IO_LIMIT_TOTAL]) {
        return true;
    }
    return false;
}
171

    
172
/*
 * Hold the calling request back while it would exceed the I/O limits of bs.
 *
 * bs:         device the request targets
 * is_write:   true for a write request, false for a read
 * nb_sectors: size of the request in sectors
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t delay_ns = -1;

    /* If other requests are already waiting, wait on the queue as well. */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /*
     * Requests are meant to keep their order (FIFO).  No later throttled
     * request is dequeued until this one may be serviced, so while this
     * request still exceeds the limits it is re-inserted at the head of
     * the queue and everything behind it stays queued.
     */
    for (;;) {
        if (!bdrv_exceed_io_limits(bs, nb_sectors, is_write, &delay_ns)) {
            break;
        }
        qemu_mod_timer(bs->block_timer,
                       delay_ns + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* This request may proceed; hand over to the next queued one. */
    qemu_co_queue_next(&bs->throttled_reqs);
}
196

    
197
/*
 * Check whether path starts with "<protocol>:".
 *
 * Returns 1 when the first character out of ':' and the directory
 * separators found in path is a ':', and 0 otherwise.  On Windows,
 * backslash is a separator too, and drive names or drive prefixes are
 * never treated as a protocol.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* strcspn() stops at the terminating NUL if no stop character exists */
    return path[strcspn(path, stop_chars)] == ':';
}
214

    
215
/*
 * Return 1 when path is absolute, 0 otherwise.
 *
 * A path is absolute when it starts with '/'.  On Windows a leading
 * backslash, a drive name (such as "\\.\d:") and a "<letter>:" prefix
 * count as absolute too.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
227

    
228
/*
 * Build in dest (at most dest_size bytes, always NUL-terminated when
 * dest_size > 0) the path of filename interpreted relative to base_path.
 *
 * If filename is absolute it is simply copied.  Otherwise the directory
 * part of base_path is copied first and filename is appended to it.  The
 * directory part ends after the last path separator, or after the first
 * ':' when that comes later, so URL-like base paths are supported.
 * Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Position just past the first ':' of base_path, if it has one. */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* Position just past the last directory separator, if there is one. */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* Keep whichever prefix is longer, truncated to fit into dest. */
    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
271

    
272
/*
 * Write the full name of the backing file of bs into dest (sz bytes).
 *
 * An empty backing file name, or one that starts with a protocol, is
 * copied unchanged; anything else is combined with bs->filename through
 * path_combine().
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;
    bool copy_verbatim = backing[0] == '\0' || path_has_protocol(backing);

    if (!copy_verbatim) {
        path_combine(dest, sz, bs->filename, backing);
        return;
    }
    pstrcpy(dest, sz, backing);
}
280

    
281
/*
 * Add bdrv to the head of the list of registered block drivers.
 *
 * A driver without a bdrv_co_readv callback gets the emulated coroutine
 * read/write callbacks installed.  Those are implemented in terms of AIO,
 * so a driver that also lacks bdrv_aio_readv gets the emulated AIO
 * callbacks as well.
 */
void bdrv_register(BlockDriver *bdrv)
{
    const bool needs_co_emulation = !bdrv->bdrv_co_readv;

    if (needs_co_emulation) {
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
300

    
301
/* create a new block device (by default it is empty) */
302
BlockDriverState *bdrv_new(const char *device_name)
303
{
304
    BlockDriverState *bs;
305

    
306
    bs = g_malloc0(sizeof(BlockDriverState));
307
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
308
    if (device_name[0] != '\0') {
309
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
310
    }
311
    bdrv_iostatus_disable(bs);
312
    notifier_list_init(&bs->close_notifiers);
313

    
314
    return bs;
315
}
316

    
317
/* Add notify to the list of close notifiers kept in bs. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    NotifierList *on_close = &bs->close_notifiers;

    notifier_list_add(on_close, notify);
}
321

    
322
BlockDriver *bdrv_find_format(const char *format_name)
323
{
324
    BlockDriver *drv1;
325
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
326
        if (!strcmp(drv1->format_name, format_name)) {
327
            return drv1;
328
        }
329
    }
330
    return NULL;
331
}
332

    
333
/*
 * Return 1 when drv may be used according to CONFIG_BDRV_WHITELIST, else 0.
 *
 * The list built from CONFIG_BDRV_WHITELIST is scanned up to its first
 * NULL entry.  A list whose first entry is NULL means there is no
 * whitelist, in which case every driver is accepted.
 */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    size_t i;

    if (whitelist[0] == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    for (i = 0; whitelist[i] != NULL; i++) {
        if (strcmp(drv->format_name, whitelist[i]) == 0) {
            return 1;
        }
    }
    return 0;
}
350

    
351
/*
 * Like bdrv_find_format(), but returns NULL as well when the driver that
 * was found is not whitelisted.
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (drv == NULL || !bdrv_is_whitelisted(drv)) {
        return NULL;
    }
    return drv;
}
356

    
357
typedef struct CreateCo {
358
    BlockDriver *drv;
359
    char *filename;
360
    QEMUOptionParameter *options;
361
    int ret;
362
} CreateCo;
363

    
364
static void coroutine_fn bdrv_create_co_entry(void *opaque)
365
{
366
    CreateCo *cco = opaque;
367
    assert(cco->drv);
368

    
369
    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
370
}
371

    
372
int bdrv_create(BlockDriver *drv, const char* filename,
373
    QEMUOptionParameter *options)
374
{
375
    int ret;
376

    
377
    Coroutine *co;
378
    CreateCo cco = {
379
        .drv = drv,
380
        .filename = g_strdup(filename),
381
        .options = options,
382
        .ret = NOT_DONE,
383
    };
384

    
385
    if (!drv->bdrv_create) {
386
        ret = -ENOTSUP;
387
        goto out;
388
    }
389

    
390
    if (qemu_in_coroutine()) {
391
        /* Fast-path if already in coroutine context */
392
        bdrv_create_co_entry(&cco);
393
    } else {
394
        co = qemu_coroutine_create(bdrv_create_co_entry);
395
        qemu_coroutine_enter(co, &cco);
396
        while (cco.ret == NOT_DONE) {
397
            qemu_aio_wait();
398
        }
399
    }
400

    
401
    ret = cco.ret;
402

    
403
out:
404
    g_free(cco.filename);
405
    return ret;
406
}
407

    
408
/*
 * Create filename using the protocol driver selected by
 * bdrv_find_protocol().  Returns -ENOENT when no driver is found,
 * otherwise the result of bdrv_create().
 */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *proto_drv = bdrv_find_protocol(filename);

    return proto_drv ? bdrv_create(proto_drv, filename, options) : -ENOENT;
}
419

    
420
/*
 * Create a uniquely-named empty temporary file.
 *
 * filename: buffer that receives the name of the created file
 * size:     capacity of filename in bytes (at least MAX_PATH on Windows)
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows the
 * negated GetLastError() code).  -EOVERFLOW is returned when the name does
 * not fit into the buffer.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int len;
    const char *tmpdir;

    /* A non-positive size would reach snprintf() as a huge size_t. */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno; report the close() failure. */
        int saved_errno = errno;

        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
454

    
455
/*
456
 * Detect host devices. By convention, /dev/cdrom[N] is always
457
 * recognized as a host CDROM.
458
 */
459
static BlockDriver *find_hdev_driver(const char *filename)
460
{
461
    int score_max = 0, score;
462
    BlockDriver *drv = NULL, *d;
463

    
464
    QLIST_FOREACH(d, &bdrv_drivers, list) {
465
        if (d->bdrv_probe_device) {
466
            score = d->bdrv_probe_device(filename);
467
            if (score > score_max) {
468
                score_max = score;
469
                drv = d;
470
            }
471
        }
472
    }
473

    
474
    return drv;
475
}
476

    
477
BlockDriver *bdrv_find_protocol(const char *filename)
478
{
479
    BlockDriver *drv1;
480
    char protocol[128];
481
    int len;
482
    const char *p;
483

    
484
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
485

    
486
    /*
487
     * XXX(hch): we really should not let host device detection
488
     * override an explicit protocol specification, but moving this
489
     * later breaks access to device names with colons in them.
490
     * Thanks to the brain-dead persistent naming schemes on udev-
491
     * based Linux systems those actually are quite common.
492
     */
493
    drv1 = find_hdev_driver(filename);
494
    if (drv1) {
495
        return drv1;
496
    }
497

    
498
    if (!path_has_protocol(filename)) {
499
        return bdrv_find_format("file");
500
    }
501
    p = strchr(filename, ':');
502
    assert(p != NULL);
503
    len = p - filename;
504
    if (len > sizeof(protocol) - 1)
505
        len = sizeof(protocol) - 1;
506
    memcpy(protocol, filename, len);
507
    protocol[len] = '\0';
508
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
509
        if (drv1->protocol_name &&
510
            !strcmp(drv1->protocol_name, protocol)) {
511
            return drv1;
512
        }
513
    }
514
    return NULL;
515
}
516

    
517
static int find_image_format(BlockDriverState *bs, const char *filename,
518
                             BlockDriver **pdrv)
519
{
520
    int score, score_max;
521
    BlockDriver *drv1, *drv;
522
    uint8_t buf[2048];
523
    int ret = 0;
524

    
525
    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
526
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
527
        drv = bdrv_find_format("raw");
528
        if (!drv) {
529
            ret = -ENOENT;
530
        }
531
        *pdrv = drv;
532
        return ret;
533
    }
534

    
535
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
536
    if (ret < 0) {
537
        *pdrv = NULL;
538
        return ret;
539
    }
540

    
541
    score_max = 0;
542
    drv = NULL;
543
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
544
        if (drv1->bdrv_probe) {
545
            score = drv1->bdrv_probe(buf, ret, filename);
546
            if (score > score_max) {
547
                score_max = score;
548
                drv = drv1;
549
            }
550
        }
551
    }
552
    if (!drv) {
553
        ret = -ENOENT;
554
    }
555
    *pdrv = drv;
556
    return ret;
557
}
558

    
559
/**
560
 * Set the current 'total_sectors' value
561
 */
562
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
563
{
564
    BlockDriver *drv = bs->drv;
565

    
566
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
567
    if (bs->sg)
568
        return 0;
569

    
570
    /* query actual device if possible, otherwise just trust the hint */
571
    if (drv->bdrv_getlength) {
572
        int64_t length = drv->bdrv_getlength(bs);
573
        if (length < 0) {
574
            return length;
575
        }
576
        hint = length >> BDRV_SECTOR_BITS;
577
    }
578

    
579
    bs->total_sectors = hint;
580
    return 0;
581
}
582

    
583
/**
584
 * Set open flags for a given cache mode
585
 *
586
 * Return 0 on success, -1 if the cache mode was invalid.
587
 */
588
int bdrv_parse_cache_flags(const char *mode, int *flags)
589
{
590
    *flags &= ~BDRV_O_CACHE_MASK;
591

    
592
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
593
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
594
    } else if (!strcmp(mode, "directsync")) {
595
        *flags |= BDRV_O_NOCACHE;
596
    } else if (!strcmp(mode, "writeback")) {
597
        *flags |= BDRV_O_CACHE_WB;
598
    } else if (!strcmp(mode, "unsafe")) {
599
        *flags |= BDRV_O_CACHE_WB;
600
        *flags |= BDRV_O_NO_FLUSH;
601
    } else if (!strcmp(mode, "writethrough")) {
602
        /* this is the default */
603
    } else {
604
        return -1;
605
    }
606

    
607
    return 0;
608
}
609

    
610
/**
611
 * The copy-on-read flag is actually a reference count so multiple users may
612
 * use the feature without worrying about clobbering its previous state.
613
 * Copy-on-read stays enabled until all users have called to disable it.
614
 */
615
void bdrv_enable_copy_on_read(BlockDriverState *bs)
616
{
617
    bs->copy_on_read++;
618
}
619

    
620
/*
 * Drop one reference on the copy-on-read feature of bs.  The count must
 * be positive on entry.
 */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
625

    
626
/*
 * Derive the flags handed to a driver's open callback from the caller's
 * flags: BDRV_O_CACHE_WB is always set, the flags that are internal to the
 * block layer (BDRV_O_SNAPSHOT, BDRV_O_NO_BACKING) are cleared, and
 * BDRV_O_RDWR is forced for temporary images because snapshots should be
 * writable.
 */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    const int internal_only = BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING;
    int open_flags = (flags | BDRV_O_CACHE_WB) & ~internal_only;

    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
645

    
646
/*
647
 * Common part for opening disk images and files
648
 */
649
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
650
    const char *filename,
651
    int flags, BlockDriver *drv)
652
{
653
    int ret, open_flags;
654

    
655
    assert(drv != NULL);
656
    assert(bs->file == NULL);
657

    
658
    trace_bdrv_open_common(bs, filename, flags, drv->format_name);
659

    
660
    bs->open_flags = flags;
661
    bs->buffer_alignment = 512;
662

    
663
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
664
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
665
        bdrv_enable_copy_on_read(bs);
666
    }
667

    
668
    pstrcpy(bs->filename, sizeof(bs->filename), filename);
669

    
670
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
671
        return -ENOTSUP;
672
    }
673

    
674
    bs->drv = drv;
675
    bs->opaque = g_malloc0(drv->instance_size);
676

    
677
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
678
    open_flags = bdrv_open_flags(bs, flags);
679

    
680
    bs->read_only = !(open_flags & BDRV_O_RDWR);
681

    
682
    /* Open the image, either directly or using a protocol */
683
    if (drv->bdrv_file_open) {
684
        if (file != NULL) {
685
            bdrv_swap(file, bs);
686
            ret = 0;
687
        } else {
688
            ret = drv->bdrv_file_open(bs, filename, open_flags);
689
        }
690
    } else {
691
        assert(file != NULL);
692
        bs->file = file;
693
        ret = drv->bdrv_open(bs, open_flags);
694
    }
695

    
696
    if (ret < 0) {
697
        goto free_and_fail;
698
    }
699

    
700
    ret = refresh_total_sectors(bs, bs->total_sectors);
701
    if (ret < 0) {
702
        goto free_and_fail;
703
    }
704

    
705
#ifndef _WIN32
706
    if (bs->is_temporary) {
707
        unlink(filename);
708
    }
709
#endif
710
    return 0;
711

    
712
free_and_fail:
713
    bs->file = NULL;
714
    g_free(bs->opaque);
715
    bs->opaque = NULL;
716
    bs->drv = NULL;
717
    return ret;
718
}
719

    
720
/*
721
 * Opens a file using a protocol (file, host_device, nbd, ...)
722
 */
723
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
724
{
725
    BlockDriverState *bs;
726
    BlockDriver *drv;
727
    int ret;
728

    
729
    drv = bdrv_find_protocol(filename);
730
    if (!drv) {
731
        return -ENOENT;
732
    }
733

    
734
    bs = bdrv_new("");
735
    ret = bdrv_open_common(bs, NULL, filename, flags, drv);
736
    if (ret < 0) {
737
        bdrv_delete(bs);
738
        return ret;
739
    }
740
    bs->growable = 1;
741
    *pbs = bs;
742
    return 0;
743
}
744

    
745
/*
 * Open the backing file of bs, if it has one that is not open yet.
 *
 * Returns 0 when the backing file is already open, when bs names no
 * backing file, or when it was opened successfully.  If opening fails the
 * negative error is returned, bs->backing_hd is reset to NULL and
 * BDRV_O_NO_BACKING is set in bs->open_flags.
 */
int bdrv_open_backing_file(BlockDriverState *bs)
{
    char backing_filename[PATH_MAX];
    BlockDriver *back_drv = NULL;
    int back_flags;
    int ret;

    if (bs->backing_hd != NULL) {
        return 0;
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (bs->backing_file[0] == '\0') {
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    /* An explicitly recorded backing format selects the driver. */
    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
    if (ret >= 0) {
        return 0;
    }

    bdrv_delete(bs->backing_hd);
    bs->backing_hd = NULL;
    bs->open_flags |= BDRV_O_NO_BACKING;
    return ret;
}
780

    
781
/*
782
 * Opens a disk image (raw, qcow2, vmdk, ...)
783
 */
784
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
785
              BlockDriver *drv)
786
{
787
    int ret;
788
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
789
    char tmp_filename[PATH_MAX + 1];
790
    BlockDriverState *file = NULL;
791

    
792
    if (flags & BDRV_O_SNAPSHOT) {
793
        BlockDriverState *bs1;
794
        int64_t total_size;
795
        int is_protocol = 0;
796
        BlockDriver *bdrv_qcow2;
797
        QEMUOptionParameter *options;
798
        char backing_filename[PATH_MAX];
799

    
800
        /* if snapshot, we create a temporary backing file and open it
801
           instead of opening 'filename' directly */
802

    
803
        /* if there is a backing file, use it */
804
        bs1 = bdrv_new("");
805
        ret = bdrv_open(bs1, filename, 0, drv);
806
        if (ret < 0) {
807
            bdrv_delete(bs1);
808
            return ret;
809
        }
810
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
811

    
812
        if (bs1->drv && bs1->drv->protocol_name)
813
            is_protocol = 1;
814

    
815
        bdrv_delete(bs1);
816

    
817
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
818
        if (ret < 0) {
819
            return ret;
820
        }
821

    
822
        /* Real path is meaningless for protocols */
823
        if (is_protocol)
824
            snprintf(backing_filename, sizeof(backing_filename),
825
                     "%s", filename);
826
        else if (!realpath(filename, backing_filename))
827
            return -errno;
828

    
829
        bdrv_qcow2 = bdrv_find_format("qcow2");
830
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
831

    
832
        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
833
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
834
        if (drv) {
835
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
836
                drv->format_name);
837
        }
838

    
839
        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
840
        free_option_parameters(options);
841
        if (ret < 0) {
842
            return ret;
843
        }
844

    
845
        filename = tmp_filename;
846
        drv = bdrv_qcow2;
847
        bs->is_temporary = 1;
848
    }
849

    
850
    /* Open image file without format layer */
851
    if (flags & BDRV_O_RDWR) {
852
        flags |= BDRV_O_ALLOW_RDWR;
853
    }
854

    
855
    ret = bdrv_file_open(&file, filename, bdrv_open_flags(bs, flags));
856
    if (ret < 0) {
857
        return ret;
858
    }
859

    
860
    /* Find the right image format driver */
861
    if (!drv) {
862
        ret = find_image_format(file, filename, &drv);
863
    }
864

    
865
    if (!drv) {
866
        goto unlink_and_fail;
867
    }
868

    
869
    /* Open the image */
870
    ret = bdrv_open_common(bs, file, filename, flags, drv);
871
    if (ret < 0) {
872
        goto unlink_and_fail;
873
    }
874

    
875
    if (bs->file != file) {
876
        bdrv_delete(file);
877
        file = NULL;
878
    }
879

    
880
    /* If there is a backing file, use it */
881
    if ((flags & BDRV_O_NO_BACKING) == 0) {
882
        ret = bdrv_open_backing_file(bs);
883
        if (ret < 0) {
884
            bdrv_close(bs);
885
            return ret;
886
        }
887
    }
888

    
889
    if (!bdrv_key_required(bs)) {
890
        bdrv_dev_change_media_cb(bs, true);
891
    }
892

    
893
    /* throttling disk I/O limits */
894
    if (bs->io_limits_enabled) {
895
        bdrv_io_limits_enable(bs);
896
    }
897

    
898
    return 0;
899

    
900
unlink_and_fail:
901
    if (file != NULL) {
902
        bdrv_delete(file);
903
    }
904
    if (bs->is_temporary) {
905
        unlink(filename);
906
    }
907
    return ret;
908
}
909

    
910
typedef struct BlockReopenQueueEntry {
911
     bool prepared;
912
     BDRVReopenState state;
913
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
914
} BlockReopenQueueEntry;
915

    
916
/*
917
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
918
 * reopen of multiple devices.
919
 *
920
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
921
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
922
 * be created and initialized. This newly created BlockReopenQueue should be
923
 * passed back in for subsequent calls that are intended to be of the same
924
 * atomic 'set'.
925
 *
926
 * bs is the BlockDriverState to add to the reopen queue.
927
 *
928
 * flags contains the open flags for the associated bs
929
 *
930
 * returns a pointer to bs_queue, which is either the newly allocated
931
 * bs_queue, or the existing bs_queue being used.
932
 *
933
 */
934
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
935
                                    BlockDriverState *bs, int flags)
936
{
937
    assert(bs != NULL);
938

    
939
    BlockReopenQueueEntry *bs_entry;
940
    if (bs_queue == NULL) {
941
        bs_queue = g_new0(BlockReopenQueue, 1);
942
        QSIMPLEQ_INIT(bs_queue);
943
    }
944

    
945
    if (bs->file) {
946
        bdrv_reopen_queue(bs_queue, bs->file, flags);
947
    }
948

    
949
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
950
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
951

    
952
    bs_entry->state.bs = bs;
953
    bs_entry->state.flags = flags;
954

    
955
    return bs_queue;
956
}
957

    
958
/*
959
 * Reopen multiple BlockDriverStates atomically & transactionally.
960
 *
961
 * The queue passed in (bs_queue) must have been built up previous
962
 * via bdrv_reopen_queue().
963
 *
964
 * Reopens all BDS specified in the queue, with the appropriate
965
 * flags.  All devices are prepared for reopen, and failure of any
966
 * device will cause all device changes to be abandonded, and intermediate
967
 * data cleaned up.
968
 *
969
 * If all devices prepare successfully, then the changes are committed
970
 * to all devices.
971
 *
972
 */
973
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
974
{
975
    int ret = -1;
976
    BlockReopenQueueEntry *bs_entry, *next;
977
    Error *local_err = NULL;
978

    
979
    assert(bs_queue != NULL);
980

    
981
    bdrv_drain_all();
982

    
983
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
984
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
985
            error_propagate(errp, local_err);
986
            goto cleanup;
987
        }
988
        bs_entry->prepared = true;
989
    }
990

    
991
    /* If we reach this point, we have success and just need to apply the
992
     * changes
993
     */
994
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
995
        bdrv_reopen_commit(&bs_entry->state);
996
    }
997

    
998
    ret = 0;
999

    
1000
cleanup:
1001
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1002
        if (ret && bs_entry->prepared) {
1003
            bdrv_reopen_abort(&bs_entry->state);
1004
        }
1005
        g_free(bs_entry);
1006
    }
1007
    g_free(bs_queue);
1008
    return ret;
1009
}
1010

    
1011

    
1012
/* Reopen a single BlockDriverState with the specified flags. */
1013
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1014
{
1015
    int ret = -1;
1016
    Error *local_err = NULL;
1017
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1018

    
1019
    ret = bdrv_reopen_multiple(queue, &local_err);
1020
    if (local_err != NULL) {
1021
        error_propagate(errp, local_err);
1022
    }
1023
    return ret;
1024
}
1025

    
1026

    
1027
/*
1028
 * Prepares a BlockDriverState for reopen. All changes are staged in the
1029
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1030
 * the block driver layer .bdrv_reopen_prepare()
1031
 *
1032
 * bs is the BlockDriverState to reopen
1033
 * flags are the new open flags
1034
 * queue is the reopen queue
1035
 *
1036
 * Returns 0 on success, non-zero on error.  On error errp will be set
1037
 * as well.
1038
 *
1039
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1040
 * It is the responsibility of the caller to then call the abort() or
1041
 * commit() for any other BDS that have been left in a prepare() state
1042
 *
1043
 */
1044
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1045
                        Error **errp)
1046
{
1047
    int ret = -1;
1048
    Error *local_err = NULL;
1049
    BlockDriver *drv;
1050

    
1051
    assert(reopen_state != NULL);
1052
    assert(reopen_state->bs->drv != NULL);
1053
    drv = reopen_state->bs->drv;
1054

    
1055
    /* if we are to stay read-only, do not allow permission change
1056
     * to r/w */
1057
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1058
        reopen_state->flags & BDRV_O_RDWR) {
1059
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1060
                  reopen_state->bs->device_name);
1061
        goto error;
1062
    }
1063

    
1064

    
1065
    ret = bdrv_flush(reopen_state->bs);
1066
    if (ret) {
1067
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1068
                  strerror(-ret));
1069
        goto error;
1070
    }
1071

    
1072
    if (drv->bdrv_reopen_prepare) {
1073
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1074
        if (ret) {
1075
            if (local_err != NULL) {
1076
                error_propagate(errp, local_err);
1077
            } else {
1078
                error_set(errp, QERR_OPEN_FILE_FAILED,
1079
                          reopen_state->bs->filename);
1080
            }
1081
            goto error;
1082
        }
1083
    } else {
1084
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1085
         * handler for each supported drv. */
1086
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1087
                  drv->format_name, reopen_state->bs->device_name,
1088
                 "reopening of file");
1089
        ret = -1;
1090
        goto error;
1091
    }
1092

    
1093
    ret = 0;
1094

    
1095
error:
1096
    return ret;
1097
}
1098

    
1099
/*
1100
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1101
 * makes them final by swapping the staging BlockDriverState contents into
1102
 * the active BlockDriverState contents.
1103
 */
1104
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1105
{
1106
    BlockDriver *drv;
1107

    
1108
    assert(reopen_state != NULL);
1109
    drv = reopen_state->bs->drv;
1110
    assert(drv != NULL);
1111

    
1112
    /* If there are any driver level actions to take */
1113
    if (drv->bdrv_reopen_commit) {
1114
        drv->bdrv_reopen_commit(reopen_state);
1115
    }
1116

    
1117
    /* set BDS specific flags now */
1118
    reopen_state->bs->open_flags         = reopen_state->flags;
1119
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1120
                                              BDRV_O_CACHE_WB);
1121
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1122
}
1123

    
1124
/*
1125
 * Abort the reopen, and delete and free the staged changes in
1126
 * reopen_state
1127
 */
1128
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1129
{
1130
    BlockDriver *drv;
1131

    
1132
    assert(reopen_state != NULL);
1133
    drv = reopen_state->bs->drv;
1134
    assert(drv != NULL);
1135

    
1136
    if (drv->bdrv_reopen_abort) {
1137
        drv->bdrv_reopen_abort(reopen_state);
1138
    }
1139
}
1140

    
1141

    
1142
/*
 * Close an image.
 *
 * Flushes bs, synchronously cancels its block job (if any), drains all
 * pending I/O and fires the close notifiers.  If a driver is attached, the
 * backing file and the underlying bs->file are deleted, the driver is
 * closed and the per-image fields are reset.  Finally the attached device
 * is told about the media change and I/O throttling is switched off.
 */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job != NULL) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv != NULL) {
        if (bs_snapshots == bs) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd != NULL) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        /* forget everything that described the image just closed */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1190

    
1191
void bdrv_close_all(void)
1192
{
1193
    BlockDriverState *bs;
1194

    
1195
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1196
        bdrv_close(bs);
1197
    }
1198
}
1199

    
1200
/*
1201
 * Wait for pending requests to complete across all BlockDriverStates
1202
 *
1203
 * This function does not flush data to disk, use bdrv_flush_all() for that
1204
 * after calling this function.
1205
 *
1206
 * Note that completion of an asynchronous I/O operation can trigger any
1207
 * number of other I/O operations on other devices---for example a coroutine
1208
 * can be arbitrarily complex and a constant flow of I/O can come until the
1209
 * coroutine is complete.  Because of this, it is not possible to have a
1210
 * function to drain a single device's I/O queue.
1211
 */
1212
void bdrv_drain_all(void)
1213
{
1214
    BlockDriverState *bs;
1215
    bool busy;
1216

    
1217
    do {
1218
        busy = qemu_aio_wait();
1219

    
1220
        /* FIXME: We do not have timer support here, so this is effectively
1221
         * a busy wait.
1222
         */
1223
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
1224
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
1225
                qemu_co_queue_restart_all(&bs->throttled_reqs);
1226
                busy = true;
1227
            }
1228
        }
1229
    } while (busy);
1230

    
1231
    /* If requests are still pending there is a bug somewhere */
1232
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1233
        assert(QLIST_EMPTY(&bs->tracked_requests));
1234
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
1235
    }
1236
}
1237

    
1238
/* make a BlockDriverState anonymous by removing from bdrv_state list.
1239
   Also, NULL terminate the device_name to prevent double remove */
1240
void bdrv_make_anon(BlockDriverState *bs)
1241
{
1242
    if (bs->device_name[0] != '\0') {
1243
        QTAILQ_REMOVE(&bdrv_states, bs, list);
1244
    }
1245
    bs->device_name[0] = '\0';
1246
}
1247

    
1248
/* Invoke the driver's optional .bdrv_rebind() hook for bs, if present. */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (!bs->drv) {
        return;
    }
    if (!bs->drv->bdrv_rebind) {
        return;
    }
    bs->drv->bdrv_rebind(bs);
}
1254

    
1255
/*
 * Copy from bs_src to bs_dest the fields that have to stay attached to the
 * device rather than to the image contents.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* identity: same name and same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;

    /* open flags and cache mode */
    bs_dest->open_flags = bs_src->open_flags;
    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* attached device */
    bs_dest->dev = bs_src->dev;
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    /* I/O throttling state */
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
    bs_dest->io_limits = bs_src->io_limits;
    bs_dest->io_base = bs_src->io_base;
    bs_dest->slice_time = bs_src->slice_time;
    bs_dest->slice_start = bs_src->slice_start;
    bs_dest->slice_end = bs_src->slice_end;
    bs_dest->throttled_reqs = bs_src->throttled_reqs;
    bs_dest->block_timer = bs_src->block_timer;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap = bs_src->dirty_bitmap;

    /* block job ownership */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;
}
1300

    
1301
/*
1302
 * Swap bs contents for two image chains while they are live,
1303
 * while keeping required fields on the BlockDriverState that is
1304
 * actually attached to a device.
1305
 *
1306
 * This will modify the BlockDriverState fields, and swap contents
1307
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1308
 *
1309
 * bs_new is required to be anonymous.
1310
 *
1311
 * This function does not create any image files.
1312
 */
1313
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1314
{
1315
    BlockDriverState tmp;
1316

    
1317
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1318
    assert(bs_new->device_name[0] == '\0');
1319
    assert(bs_new->dirty_bitmap == NULL);
1320
    assert(bs_new->job == NULL);
1321
    assert(bs_new->dev == NULL);
1322
    assert(bs_new->in_use == 0);
1323
    assert(bs_new->io_limits_enabled == false);
1324
    assert(bs_new->block_timer == NULL);
1325

    
1326
    tmp = *bs_new;
1327
    *bs_new = *bs_old;
1328
    *bs_old = tmp;
1329

    
1330
    /* there are some fields that should not be swapped, move them back */
1331
    bdrv_move_feature_fields(&tmp, bs_old);
1332
    bdrv_move_feature_fields(bs_old, bs_new);
1333
    bdrv_move_feature_fields(bs_new, &tmp);
1334

    
1335
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1336
    assert(bs_new->device_name[0] == '\0');
1337

    
1338
    /* Check a few fields that should remain attached to the device */
1339
    assert(bs_new->dev == NULL);
1340
    assert(bs_new->job == NULL);
1341
    assert(bs_new->in_use == 0);
1342
    assert(bs_new->io_limits_enabled == false);
1343
    assert(bs_new->block_timer == NULL);
1344

    
1345
    bdrv_rebind(bs_new);
1346
    bdrv_rebind(bs_old);
1347
}
1348

    
1349
/*
1350
 * Add new bs contents at the top of an image chain while the chain is
1351
 * live, while keeping required fields on the top layer.
1352
 *
1353
 * This will modify the BlockDriverState fields, and swap contents
1354
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1355
 *
1356
 * bs_new is required to be anonymous.
1357
 *
1358
 * This function does not create any image files.
1359
 */
1360
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1361
{
1362
    bdrv_swap(bs_new, bs_top);
1363

    
1364
    /* The contents of 'tmp' will become bs_top, as we are
1365
     * swapping bs_new and bs_top contents. */
1366
    bs_top->backing_hd = bs_new;
1367
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1368
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1369
            bs_new->filename);
1370
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1371
            bs_new->drv ? bs_new->drv->format_name : "");
1372
}
1373

    
1374
/*
 * Destroy a BlockDriverState: unlink it from bdrv_states if it is named,
 * close it and free the structure.  The BDS must have no attached device,
 * no block job and must not be marked in use.
 */
void bdrv_delete(BlockDriverState *bs)
{
    assert(bs->dev == NULL);
    assert(bs->job == NULL);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs_snapshots != bs);
    g_free(bs);
}
1388

    
1389
/*
 * Attach dev to bs and reset the I/O status.
 *
 * Returns 0 on success, -EBUSY if bs already has a device attached.
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev != NULL) {
        return -EBUSY;
    }

    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}
1399

    
1400
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1401
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1402
{
1403
    if (bdrv_attach_dev(bs, dev) < 0) {
1404
        abort();
1405
    }
1406
}
1407

    
1408
/*
 * Detach dev, which must be the device currently attached to bs, and reset
 * the device-related fields (ops, opaque pointer, buffer alignment).
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(dev == bs->dev);

    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->dev = NULL;
    bs->buffer_alignment = 512;
}
1417

    
1418
/* TODO change to return DeviceState * when all users are qdevified */
1419
void *bdrv_get_attached_dev(BlockDriverState *bs)
1420
{
1421
    return bs->dev;
1422
}
1423

    
1424
/*
 * Install the device callbacks and their opaque argument on bs.  If bs is
 * the current bs_snapshots and now counts as having removable media,
 * bs_snapshots is cleared.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;

    if (bdrv_dev_has_removable_media(bs)) {
        if (bs_snapshots == bs) {
            bs_snapshots = NULL;
        }
    }
}
1433

    
1434
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1435
                               enum MonitorEvent ev,
1436
                               BlockErrorAction action, bool is_read)
1437
{
1438
    QObject *data;
1439
    const char *action_str;
1440

    
1441
    switch (action) {
1442
    case BDRV_ACTION_REPORT:
1443
        action_str = "report";
1444
        break;
1445
    case BDRV_ACTION_IGNORE:
1446
        action_str = "ignore";
1447
        break;
1448
    case BDRV_ACTION_STOP:
1449
        action_str = "stop";
1450
        break;
1451
    default:
1452
        abort();
1453
    }
1454

    
1455
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1456
                              bdrv->device_name,
1457
                              action_str,
1458
                              is_read ? "read" : "write");
1459
    monitor_protocol_event(ev, data);
1460

    
1461
    qobject_decref(data);
1462
}
1463

    
1464
/*
 * Emit QEVENT_DEVICE_TRAY_MOVED for bs; 'tray-open' in the event data is
 * set from ejected.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    const char *device = bdrv_get_device_name(bs);
    QObject *payload;

    payload = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                 device, ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, payload);

    qobject_decref(payload);
}
1474

    
1475
/*
 * Tell the attached device, through its change_media_cb, that the medium
 * changed.  Does nothing when the device has no such callback.  A tray-open
 * event is emitted if the tray was closed before the callback ran, and a
 * tray-close event is emitted when a medium is being loaded.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool tray_was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* sample the tray state before the device gets to change it */
    tray_was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (tray_was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1490

    
1491
/*
 * Return true if bs has no device attached, or if the attached device
 * provides a change_media_cb.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
1495

    
1496
/* Forward an eject request to the device's eject_request_cb, if it has one. */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (!bs->dev_ops || !bs->dev_ops->eject_request_cb) {
        return;
    }
    bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
}
1502

    
1503
/*
 * Ask the device's is_tray_open callback whether the tray is open; returns
 * false when the device does not provide that callback.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_tray_open) {
        return false;
    }
    return bs->dev_ops->is_tray_open(bs->dev_opaque);
}
1510

    
1511
/* Invoke the device's resize_cb, if it has one. */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->resize_cb) {
        return;
    }
    bs->dev_ops->resize_cb(bs->dev_opaque);
}
1517

    
1518
/*
 * Ask the device's is_medium_locked callback whether the medium is locked;
 * returns false when the device does not provide that callback.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_medium_locked) {
        return false;
    }
    return bs->dev_ops->is_medium_locked(bs->dev_opaque);
}
1525

    
1526
/*
1527
 * Run consistency checks on an image
1528
 *
1529
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1530
 * free of errors) or -errno when an internal error occurred. The results of the
1531
 * check are stored in res.
1532
 */
1533
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1534
{
1535
    if (bs->drv->bdrv_check == NULL) {
1536
        return -ENOTSUP;
1537
    }
1538

    
1539
    memset(res, 0, sizeof(*res));
1540
    return bs->drv->bdrv_check(bs, res, fix);
1541
}
1542

    
1543
#define COMMIT_BUF_SECTORS 2048
1544

    
1545
/* commit COW file into the raw image */
1546
int bdrv_commit(BlockDriverState *bs)
1547
{
1548
    BlockDriver *drv = bs->drv;
1549
    int64_t sector, total_sectors;
1550
    int n, ro, open_flags;
1551
    int ret = 0;
1552
    uint8_t *buf;
1553
    char filename[PATH_MAX];
1554

    
1555
    if (!drv)
1556
        return -ENOMEDIUM;
1557
    
1558
    if (!bs->backing_hd) {
1559
        return -ENOTSUP;
1560
    }
1561

    
1562
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1563
        return -EBUSY;
1564
    }
1565

    
1566
    ro = bs->backing_hd->read_only;
1567
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1568
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1569
    open_flags =  bs->backing_hd->open_flags;
1570

    
1571
    if (ro) {
1572
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1573
            return -EACCES;
1574
        }
1575
    }
1576

    
1577
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1578
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1579

    
1580
    for (sector = 0; sector < total_sectors; sector += n) {
1581
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1582

    
1583
            if (bdrv_read(bs, sector, buf, n) != 0) {
1584
                ret = -EIO;
1585
                goto ro_cleanup;
1586
            }
1587

    
1588
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1589
                ret = -EIO;
1590
                goto ro_cleanup;
1591
            }
1592
        }
1593
    }
1594

    
1595
    if (drv->bdrv_make_empty) {
1596
        ret = drv->bdrv_make_empty(bs);
1597
        bdrv_flush(bs);
1598
    }
1599

    
1600
    /*
1601
     * Make sure all data we wrote to the backing device is actually
1602
     * stable on disk.
1603
     */
1604
    if (bs->backing_hd)
1605
        bdrv_flush(bs->backing_hd);
1606

    
1607
ro_cleanup:
1608
    g_free(buf);
1609

    
1610
    if (ro) {
1611
        /* ignoring error return here */
1612
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1613
    }
1614

    
1615
    return ret;
1616
}
1617

    
1618
int bdrv_commit_all(void)
1619
{
1620
    BlockDriverState *bs;
1621

    
1622
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1623
        int ret = bdrv_commit(bs);
1624
        if (ret < 0) {
1625
            return ret;
1626
        }
1627
    }
1628
    return 0;
1629
}
1630

    
1631
struct BdrvTrackedRequest {
1632
    BlockDriverState *bs;
1633
    int64_t sector_num;
1634
    int nb_sectors;
1635
    bool is_write;
1636
    QLIST_ENTRY(BdrvTrackedRequest) list;
1637
    Coroutine *co; /* owner, used for deadlock detection */
1638
    CoQueue wait_queue; /* coroutines blocked on this request */
1639
};
1640

    
1641
/**
1642
 * Remove an active request from the tracked requests list
1643
 *
1644
 * This function should be called when a tracked request is completing.
1645
 */
1646
static void tracked_request_end(BdrvTrackedRequest *req)
1647
{
1648
    QLIST_REMOVE(req, list);
1649
    qemu_co_queue_restart_all(&req->wait_queue);
1650
}
1651

    
1652
/**
1653
 * Add an active request to the tracked requests list
1654
 */
1655
static void tracked_request_begin(BdrvTrackedRequest *req,
1656
                                  BlockDriverState *bs,
1657
                                  int64_t sector_num,
1658
                                  int nb_sectors, bool is_write)
1659
{
1660
    *req = (BdrvTrackedRequest){
1661
        .bs = bs,
1662
        .sector_num = sector_num,
1663
        .nb_sectors = nb_sectors,
1664
        .is_write = is_write,
1665
        .co = qemu_coroutine_self(),
1666
    };
1667

    
1668
    qemu_co_queue_init(&req->wait_queue);
1669

    
1670
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1671
}
1672

    
1673
/**
1674
 * Round a region to cluster boundaries
1675
 */
1676
void bdrv_round_to_clusters(BlockDriverState *bs,
1677
                            int64_t sector_num, int nb_sectors,
1678
                            int64_t *cluster_sector_num,
1679
                            int *cluster_nb_sectors)
1680
{
1681
    BlockDriverInfo bdi;
1682

    
1683
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1684
        *cluster_sector_num = sector_num;
1685
        *cluster_nb_sectors = nb_sectors;
1686
    } else {
1687
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1688
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1689
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1690
                                            nb_sectors, c);
1691
    }
1692
}
1693

    
1694
/*
 * Return true if the sector range [sector_num, sector_num + nb_sectors)
 * intersects the range covered by req.
 */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    /* the range starts before req ends, and req starts before it ends */
    return sector_num < req->sector_num + req->nb_sectors &&
           req->sector_num < sector_num + nb_sectors;
}
1706

    
1707
/*
 * Wait until no tracked request on bs overlaps the given region, after the
 * region has been rounded to cluster boundaries.
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *other;
    int64_t first_sector;
    int sector_count;
    bool waited;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &first_sector, &sector_count);

    do {
        waited = false;
        QLIST_FOREACH(other, &bs->tracked_requests, list) {
            if (!tracked_request_overlaps(other, first_sector, sector_count)) {
                continue;
            }

            /* Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests.  This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != other->co);

            /* the list may have changed while waiting: rescan from the top */
            qemu_co_queue_wait(&other->wait_queue);
            waited = true;
            break;
        }
    } while (waited);
}
1742

    
1743
/*
1744
 * Return values:
1745
 * 0        - success
1746
 * -EINVAL  - backing format specified, but no file
1747
 * -ENOSPC  - can't update the backing file because no space is left in the
1748
 *            image file header
1749
 * -ENOTSUP - format driver doesn't support changing the backing file
1750
 */
1751
int bdrv_change_backing_file(BlockDriverState *bs,
1752
    const char *backing_file, const char *backing_fmt)
1753
{
1754
    BlockDriver *drv = bs->drv;
1755
    int ret;
1756

    
1757
    /* Backing file format doesn't make sense without a backing file */
1758
    if (backing_fmt && !backing_file) {
1759
        return -EINVAL;
1760
    }
1761

    
1762
    if (drv->bdrv_change_backing_file != NULL) {
1763
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1764
    } else {
1765
        ret = -ENOTSUP;
1766
    }
1767

    
1768
    if (ret == 0) {
1769
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1770
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1771
    }
1772
    return ret;
1773
}
1774

    
1775
/*
1776
 * Finds the image layer in the chain that has 'bs' as its backing file.
1777
 *
1778
 * active is the current topmost image.
1779
 *
1780
 * Returns NULL if bs is not found in active's image chain,
1781
 * or if active == bs.
1782
 */
1783
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
1784
                                    BlockDriverState *bs)
1785
{
1786
    BlockDriverState *overlay = NULL;
1787
    BlockDriverState *intermediate;
1788

    
1789
    assert(active != NULL);
1790
    assert(bs != NULL);
1791

    
1792
    /* if bs is the same as active, then by definition it has no overlay
1793
     */
1794
    if (active == bs) {
1795
        return NULL;
1796
    }
1797

    
1798
    intermediate = active;
1799
    while (intermediate->backing_hd) {
1800
        if (intermediate->backing_hd == bs) {
1801
            overlay = intermediate;
1802
            break;
1803
        }
1804
        intermediate = intermediate->backing_hd;
1805
    }
1806

    
1807
    return overlay;
1808
}
1809

    
1810
typedef struct BlkIntermediateStates {
1811
    BlockDriverState *bs;
1812
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
1813
} BlkIntermediateStates;
1814

    
1815

    
1816
/*
1817
 * Drops images above 'base' up to and including 'top', and sets the image
1818
 * above 'top' to have base as its backing file.
1819
 *
1820
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
1821
 * information in 'bs' can be properly updated.
1822
 *
1823
 * E.g., this will convert the following chain:
1824
 * bottom <- base <- intermediate <- top <- active
1825
 *
1826
 * to
1827
 *
1828
 * bottom <- base <- active
1829
 *
1830
 * It is allowed for bottom==base, in which case it converts:
1831
 *
1832
 * base <- intermediate <- top <- active
1833
 *
1834
 * to
1835
 *
1836
 * base <- active
1837
 *
1838
 * Error conditions:
1839
 *  if active == top, that is considered an error
1840
 *
1841
 */
1842
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
1843
                           BlockDriverState *base)
1844
{
1845
    BlockDriverState *intermediate;
1846
    BlockDriverState *base_bs = NULL;
1847
    BlockDriverState *new_top_bs = NULL;
1848
    BlkIntermediateStates *intermediate_state, *next;
1849
    int ret = -EIO;
1850

    
1851
    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
1852
    QSIMPLEQ_INIT(&states_to_delete);
1853

    
1854
    if (!top->drv || !base->drv) {
1855
        goto exit;
1856
    }
1857

    
1858
    new_top_bs = bdrv_find_overlay(active, top);
1859

    
1860
    if (new_top_bs == NULL) {
1861
        /* we could not find the image above 'top', this is an error */
1862
        goto exit;
1863
    }
1864

    
1865
    /* special case of new_top_bs->backing_hd already pointing to base - nothing
1866
     * to do, no intermediate images */
1867
    if (new_top_bs->backing_hd == base) {
1868
        ret = 0;
1869
        goto exit;
1870
    }
1871

    
1872
    intermediate = top;
1873

    
1874
    /* now we will go down through the list, and add each BDS we find
1875
     * into our deletion queue, until we hit the 'base'
1876
     */
1877
    while (intermediate) {
1878
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
1879
        intermediate_state->bs = intermediate;
1880
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
1881

    
1882
        if (intermediate->backing_hd == base) {
1883
            base_bs = intermediate->backing_hd;
1884
            break;
1885
        }
1886
        intermediate = intermediate->backing_hd;
1887
    }
1888
    if (base_bs == NULL) {
1889
        /* something went wrong, we did not end at the base. safely
1890
         * unravel everything, and exit with error */
1891
        goto exit;
1892
    }
1893

    
1894
    /* success - we can delete the intermediate states, and link top->base */
1895
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
1896
                                   base_bs->drv ? base_bs->drv->format_name : "");
1897
    if (ret) {
1898
        goto exit;
1899
    }
1900
    new_top_bs->backing_hd = base_bs;
1901

    
1902

    
1903
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
1904
        /* so that bdrv_close() does not recursively close the chain */
1905
        intermediate_state->bs->backing_hd = NULL;
1906
        bdrv_delete(intermediate_state->bs);
1907
    }
1908
    ret = 0;
1909

    
1910
exit:
1911
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
1912
        g_free(intermediate_state);
1913
    }
1914
    return ret;
1915
}
1916

    
1917

    
1918
/*
 * Validate a byte-granularity request against the device.
 *
 * Returns -ENOMEDIUM if no medium is inserted, 0 without further checks if
 * bs is growable, -EIO if the range [offset, offset + size) is not fully
 * inside the length reported by bdrv_getlength(), and 0 otherwise.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    /* "len - offset < size" keeps the bound check free of offset + size */
    if (offset < 0 || offset > len || len - offset < size) {
        return -EIO;
    }

    return 0;
}
1939

    
1940
/*
 * Validate a sector-granularity request against the device.
 *
 * Returns -EIO for a negative nb_sectors or for a byte count that does not
 * fit in size_t; otherwise converts the request to bytes and returns the
 * result of bdrv_check_byte_request().
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    uint64_t bytes;

    /* A negative count would otherwise be accepted on growable devices */
    if (nb_sectors < 0) {
        return -EIO;
    }

    /* Refuse sizes that would be silently truncated by a narrow size_t */
    bytes = (uint64_t)nb_sectors * BDRV_SECTOR_SIZE;
    if ((size_t)bytes != bytes) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   (size_t)bytes);
}
1946

    
1947
typedef struct RwCo {
1948
    BlockDriverState *bs;
1949
    int64_t sector_num;
1950
    int nb_sectors;
1951
    QEMUIOVector *qiov;
1952
    bool is_write;
1953
    int ret;
1954
} RwCo;
1955

    
1956
/*
 * Coroutine entry point for bdrv_rw_co(): dispatches the request described
 * by the RwCo passed in 'opaque' to the read or write path and stores the
 * result in its 'ret' field.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *req = opaque;
    int status;

    if (req->is_write) {
        status = bdrv_co_do_writev(req->bs, req->sector_num,
                                   req->nb_sectors, req->qiov, 0);
    } else {
        status = bdrv_co_do_readv(req->bs, req->sector_num,
                                  req->nb_sectors, req->qiov, 0);
    }

    req->ret = status;
}
1968

    
1969
/*
1970
 * Process a synchronous request using coroutines
1971
 */
1972
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1973
                      int nb_sectors, bool is_write)
1974
{
1975
    QEMUIOVector qiov;
1976
    struct iovec iov = {
1977
        .iov_base = (void *)buf,
1978
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1979
    };
1980
    Coroutine *co;
1981
    RwCo rwco = {
1982
        .bs = bs,
1983
        .sector_num = sector_num,
1984
        .nb_sectors = nb_sectors,
1985
        .qiov = &qiov,
1986
        .is_write = is_write,
1987
        .ret = NOT_DONE,
1988
    };
1989

    
1990
    qemu_iovec_init_external(&qiov, &iov, 1);
1991

    
1992
    /**
1993
     * In sync call context, when the vcpu is blocked, this throttling timer
1994
     * will not fire; so the I/O throttling function has to be disabled here
1995
     * if it has been enabled.
1996
     */
1997
    if (bs->io_limits_enabled) {
1998
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
1999
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
2000
        bdrv_io_limits_disable(bs);
2001
    }
2002

    
2003
    if (qemu_in_coroutine()) {
2004
        /* Fast-path if already in coroutine context */
2005
        bdrv_rw_co_entry(&rwco);
2006
    } else {
2007
        co = qemu_coroutine_create(bdrv_rw_co_entry);
2008
        qemu_coroutine_enter(co, &rwco);
2009
        while (rwco.ret == NOT_DONE) {
2010
            qemu_aio_wait();
2011
        }
2012
    }
2013
    return rwco.ret;
2014
}
2015

    
2016
/* return < 0 if error. See bdrv_write() for the return codes */
2017
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2018
              uint8_t *buf, int nb_sectors)
2019
{
2020
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
2021
}
2022

    
2023
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2024
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2025
                          uint8_t *buf, int nb_sectors)
2026
{
2027
    bool enabled;
2028
    int ret;
2029

    
2030
    enabled = bs->io_limits_enabled;
2031
    bs->io_limits_enabled = false;
2032
    ret = bdrv_read(bs, 0, buf, 1);
2033
    bs->io_limits_enabled = enabled;
2034
    return ret;
2035
}
2036

    
2037
/* Return < 0 if error. Important errors are:
2038
  -EIO         generic I/O error (may happen for all errors)
2039
  -ENOMEDIUM   No media inserted.
2040
  -EINVAL      Invalid sector number or nb_sectors
2041
  -EACCES      Trying to write a read-only device
2042
*/
2043
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2044
               const uint8_t *buf, int nb_sectors)
2045
{
2046
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
2047
}
2048

    
2049
/*
 * Byte-granularity synchronous read built on top of bdrv_read().
 *
 * The request is split into an unaligned head (read through a one-sector
 * temporary buffer), a run of whole sectors read directly into 'buf', and
 * an unaligned tail (again through the temporary buffer).
 *
 * Returns 'count1' on success or the negative result of the failing
 * bdrv_read() call.
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    uint8_t *dst = buf;
    int64_t sector_num;
    int remaining = count1;
    int chunk, nb_sectors;
    int ret;

    /* first read to align to sector start */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), chunk);
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        dst += chunk;
    }

    /* read the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_read(bs, sector_num, dst, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += nb_sectors;
        chunk = nb_sectors << BDRV_SECTOR_BITS;
        dst += chunk;
        remaining -= chunk;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf, remaining);
    }

    return count1;
}
2093

    
2094
/*
 * Byte-granularity synchronous write built on top of bdrv_read() and
 * bdrv_write().
 *
 * An unaligned head and an unaligned tail are handled by reading the
 * affected sector into a temporary buffer, patching it, and writing it
 * back; whole sectors in between are written directly from 'buf'.
 *
 * Returns 'count1' on success or the negative result of the failing
 * bdrv_read()/bdrv_write() call.
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    const uint8_t *src = buf;
    int64_t sector_num;
    int remaining = count1;
    int chunk, nb_sectors;
    int ret;

    /* first write to align to sector start */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), src, chunk);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        src += chunk;
    }

    /* write the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_write(bs, sector_num, src, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += nb_sectors;
        chunk = nb_sectors << BDRV_SECTOR_BITS;
        src += chunk;
        remaining -= chunk;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(sector_buf, src, remaining);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }

    return count1;
}
2142

    
2143
/*
2144
 * Writes to the file and ensures that no writes are reordered across this
2145
 * request (acts as a barrier)
2146
 *
2147
 * Returns 0 on success, -errno in error cases.
2148
 */
2149
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2150
    const void *buf, int count)
2151
{
2152
    int ret;
2153

    
2154
    ret = bdrv_pwrite(bs, offset, buf, count);
2155
    if (ret < 0) {
2156
        return ret;
2157
    }
2158

    
2159
    /* No flush needed for cache modes that already do it */
2160
    if (bs->enable_write_cache) {
2161
        bdrv_flush(bs);
2162
    }
2163

    
2164
    return 0;
2165
}
2166

    
2167
/*
 * Copy-on-read implementation: read the cluster-aligned range covering the
 * request into a bounce buffer, write that range back through the driver,
 * then copy the requested part into the caller's 'qiov'.
 *
 * Returns the negative driver result if the read or the write-back fails;
 * 'qiov' is only filled in when both succeeded.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector cluster_qiov;
    struct iovec cluster_iov;
    int64_t first_cluster_sector;
    int cluster_sectors;
    size_t head_bytes;
    int ret;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &first_cluster_sector, &cluster_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   first_cluster_sector, cluster_sectors);

    cluster_iov.iov_len = cluster_sectors * BDRV_SECTOR_SIZE;
    bounce = qemu_blockalign(bs, cluster_iov.iov_len);
    cluster_iov.iov_base = bounce;
    qemu_iovec_init_external(&cluster_qiov, &cluster_iov, 1);

    ret = drv->bdrv_co_readv(bs, first_cluster_sector, cluster_sectors,
                             &cluster_qiov);

    if (ret >= 0) {
        if (drv->bdrv_co_write_zeroes &&
            buffer_is_zero(bounce, cluster_iov.iov_len)) {
            ret = bdrv_co_do_write_zeroes(bs, first_cluster_sector,
                                          cluster_sectors);
        } else {
            /* This does not change the data on the disk, it is not necessary
             * to flush even in cache=writethrough mode.
             */
            ret = drv->bdrv_co_writev(bs, first_cluster_sector,
                                      cluster_sectors, &cluster_qiov);
        }
    }

    /* It might be okay to ignore write errors for guest requests.  If this
     * is a deliberate copy-on-read then we don't want to ignore the error.
     * Simply report it in all cases.
     */
    if (ret >= 0) {
        head_bytes = (sector_num - first_cluster_sector) * BDRV_SECTOR_SIZE;
        qemu_iovec_from_buf(qiov, 0, (uint8_t *)bounce + head_bytes,
                            nb_sectors * BDRV_SECTOR_SIZE);
    }

    qemu_vfree(bounce);
    return ret;
}
2232

    
2233
/*
2234
 * Handle a read request in coroutine context
2235
 */
2236
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2237
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2238
    BdrvRequestFlags flags)
2239
{
2240
    BlockDriver *drv = bs->drv;
2241
    BdrvTrackedRequest req;
2242
    int ret;
2243

    
2244
    if (!drv) {
2245
        return -ENOMEDIUM;
2246
    }
2247
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2248
        return -EIO;
2249
    }
2250

    
2251
    /* throttling disk read I/O */
2252
    if (bs->io_limits_enabled) {
2253
        bdrv_io_limits_intercept(bs, false, nb_sectors);
2254
    }
2255

    
2256
    if (bs->copy_on_read) {
2257
        flags |= BDRV_REQ_COPY_ON_READ;
2258
    }
2259
    if (flags & BDRV_REQ_COPY_ON_READ) {
2260
        bs->copy_on_read_in_flight++;
2261
    }
2262

    
2263
    if (bs->copy_on_read_in_flight) {
2264
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2265
    }
2266

    
2267
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2268

    
2269
    if (flags & BDRV_REQ_COPY_ON_READ) {
2270
        int pnum;
2271

    
2272
        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
2273
        if (ret < 0) {
2274
            goto out;
2275
        }
2276

    
2277
        if (!ret || pnum != nb_sectors) {
2278
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2279
            goto out;
2280
        }
2281
    }
2282

    
2283
    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2284

    
2285
out:
2286
    tracked_request_end(&req);
2287

    
2288
    if (flags & BDRV_REQ_COPY_ON_READ) {
2289
        bs->copy_on_read_in_flight--;
2290
    }
2291

    
2292
    return ret;
2293
}
2294

    
2295
/*
 * Coroutine read entry point: emits the trace event and forwards to
 * bdrv_co_do_readv() with no request flags set.
 */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2302

    
2303
/*
 * Coroutine read entry point that requests copy-on-read: emits the trace
 * event and forwards to bdrv_co_do_readv() with BDRV_REQ_COPY_ON_READ.
 */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
2311

    
2312
/*
 * Write zeroes to the given sector range.
 *
 * The driver's bdrv_co_write_zeroes callback is tried first when it exists;
 * any result other than -ENOTSUP is returned as is.  Otherwise a zero-filled
 * bounce buffer is written through the driver's bdrv_co_writev callback.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector zero_qiov;
    struct iovec zero_iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    ret = drv->bdrv_co_write_zeroes
        ? drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors)
        : -ENOTSUP;
    if (ret != -ENOTSUP) {
        return ret;
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    zero_iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    zero_iov.iov_base = qemu_blockalign(bs, zero_iov.iov_len);
    memset(zero_iov.iov_base, 0, zero_iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &zero_iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);

    qemu_vfree(zero_iov.iov_base);
    return ret;
}
2342

    
2343
/*
2344
 * Handle a write request in coroutine context
2345
 */
2346
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2347
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2348
    BdrvRequestFlags flags)
2349
{
2350
    BlockDriver *drv = bs->drv;
2351
    BdrvTrackedRequest req;
2352
    int ret;
2353

    
2354
    if (!bs->drv) {
2355
        return -ENOMEDIUM;
2356
    }
2357
    if (bs->read_only) {
2358
        return -EACCES;
2359
    }
2360
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2361
        return -EIO;
2362
    }
2363

    
2364
    /* throttling disk write I/O */
2365
    if (bs->io_limits_enabled) {
2366
        bdrv_io_limits_intercept(bs, true, nb_sectors);
2367
    }
2368

    
2369
    if (bs->copy_on_read_in_flight) {
2370
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2371
    }
2372

    
2373
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2374

    
2375
    if (flags & BDRV_REQ_ZERO_WRITE) {
2376
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2377
    } else {
2378
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2379
    }
2380

    
2381
    if (ret == 0 && !bs->enable_write_cache) {
2382
        ret = bdrv_co_flush(bs);
2383
    }
2384

    
2385
    if (bs->dirty_bitmap) {
2386
        bdrv_set_dirty(bs, sector_num, nb_sectors);
2387
    }
2388

    
2389
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2390
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
2391
    }
2392

    
2393
    tracked_request_end(&req);
2394

    
2395
    return ret;
2396
}
2397

    
2398
/*
 * Coroutine write entry point: emits the trace event and forwards to
 * bdrv_co_do_writev() with no request flags set.
 */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2405

    
2406
/*
 * Coroutine zero-write entry point: emits the trace event and forwards to
 * bdrv_co_do_writev() with BDRV_REQ_ZERO_WRITE and no data vector.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    int ret;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                            BDRV_REQ_ZERO_WRITE);
    return ret;
}
2414

    
2415
/**
2416
 * Truncate file to 'offset' bytes (needed only for file protocols)
2417
 */
2418
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2419
{
2420
    BlockDriver *drv = bs->drv;
2421
    int ret;
2422
    if (!drv)
2423
        return -ENOMEDIUM;
2424
    if (!drv->bdrv_truncate)
2425
        return -ENOTSUP;
2426
    if (bs->read_only)
2427
        return -EACCES;
2428
    if (bdrv_in_use(bs))
2429
        return -EBUSY;
2430
    ret = drv->bdrv_truncate(bs, offset);
2431
    if (ret == 0) {
2432
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2433
        bdrv_dev_resize_cb(bs);
2434
    }
2435
    return ret;
2436
}
2437

    
2438
/**
2439
 * Length of a allocated file in bytes. Sparse files are counted by actual
2440
 * allocated space. Return < 0 if error or unknown.
2441
 */
2442
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2443
{
2444
    BlockDriver *drv = bs->drv;
2445
    if (!drv) {
2446
        return -ENOMEDIUM;
2447
    }
2448
    if (drv->bdrv_get_allocated_file_size) {
2449
        return drv->bdrv_get_allocated_file_size(bs);
2450
    }
2451
    if (bs->file) {
2452
        return bdrv_get_allocated_file_size(bs->file);
2453
    }
2454
    return -ENOTSUP;
2455
}
2456

    
2457
/**
2458
 * Length of a file in bytes. Return < 0 if error or unknown.
2459
 */
2460
int64_t bdrv_getlength(BlockDriverState *bs)
2461
{
2462
    BlockDriver *drv = bs->drv;
2463
    if (!drv)
2464
        return -ENOMEDIUM;
2465

    
2466
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2467
        if (drv->bdrv_getlength) {
2468
            return drv->bdrv_getlength(bs);
2469
        }
2470
    }
2471
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2472
}
2473

    
2474
/* return 0 as number of sectors if no device present or error */
2475
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2476
{
2477
    int64_t length;
2478
    length = bdrv_getlength(bs);
2479
    if (length < 0)
2480
        length = 0;
2481
    else
2482
        length = length >> BDRV_SECTOR_BITS;
2483
    *nb_sectors_ptr = length;
2484
}
2485

    
2486
/* throttling disk io limits */
2487
void bdrv_set_io_limits(BlockDriverState *bs,
2488
                        BlockIOLimit *io_limits)
2489
{
2490
    bs->io_limits = *io_limits;
2491
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2492
}
2493

    
2494
/* Store the error policies to apply to failed reads and failed writes */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
2500

    
2501
/* Return the stored error policy for reads ('is_read' true) or writes */
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
2505

    
2506
/*
 * Map the device's error policy for the given direction and the positive
 * errno value 'error' to the action to take.  With the ENOSPC policy only
 * ENOSPC stops; any other error is reported.  An unknown policy aborts.
 */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockErrorAction action;

    switch (bdrv_get_on_error(bs, is_read)) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        if (error == ENOSPC) {
            action = BDRV_ACTION_STOP;
        } else {
            action = BDRV_ACTION_REPORT;
        }
        break;
    case BLOCKDEV_ON_ERROR_STOP:
        action = BDRV_ACTION_STOP;
        break;
    case BLOCKDEV_ON_ERROR_REPORT:
        action = BDRV_ACTION_REPORT;
        break;
    case BLOCKDEV_ON_ERROR_IGNORE:
        action = BDRV_ACTION_IGNORE;
        break;
    default:
        abort();
    }

    return action;
}
2523

    
2524
/* This is done by device models because, while the block layer knows
2525
 * about the error, it does not know whether an operation comes from
2526
 * the device or the block layer (from a job, for example).
2527
 */
2528
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2529
                       bool is_read, int error)
2530
{
2531
    assert(error >= 0);
2532
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
2533
    if (action == BDRV_ACTION_STOP) {
2534
        vm_stop(RUN_STATE_IO_ERROR);
2535
        bdrv_iostatus_set_err(bs, error);
2536
    }
2537
}
2538

    
2539
/* Return the device's read_only setting */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
2543

    
2544
/* Return the device's sg setting */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
2548

    
2549
/* Return the device's enable_write_cache setting */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
2553

    
2554
/*
 * Set the device's write cache mode and mirror it in the BDRV_O_CACHE_WB
 * bit of open_flags.
 */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    bs->open_flags = wce ? (bs->open_flags | BDRV_O_CACHE_WB)
                         : (bs->open_flags & ~BDRV_O_CACHE_WB);
}
2565

    
2566
/*
 * Return 1 when the backing device is encrypted, otherwise the device's
 * own 'encrypted' value.
 */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        return 1;
    }

    return bs->encrypted;
}
2572

    
2573
/*
 * Return non-zero when either the backing device or the device itself is
 * encrypted and does not have a valid key yet.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted && !backing->valid_key) {
        return 1;
    }

    if (!bs->encrypted) {
        return 0;
    }
    return !bs->valid_key;
}
2581

    
2582
/*
 * Set the encryption key on the device and, first, on an encrypted backing
 * device.
 *
 * Returns the backing device's error if setting its key fails, 0 if only
 * the backing device is encrypted, -EINVAL if nothing is encrypted,
 * -ENOMEDIUM if there is no driver or no bdrv_set_key callback, and
 * otherwise the driver callback's result.  The first time a key is
 * accepted the media change callback is invoked.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    BlockDriverState *backing = bs->backing_hd;
    int ret;

    if (backing && backing->encrypted) {
        ret = bdrv_set_key(backing, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }

    return ret;
}
2607

    
2608
/* Return the driver's format name, or NULL when the device has no driver */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }

    return bs->drv->format_name;
}
2612

    
2613
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2614
                         void *opaque)
2615
{
2616
    BlockDriver *drv;
2617

    
2618
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
2619
        it(opaque, drv->format_name);
2620
    }
2621
}
2622

    
2623
BlockDriverState *bdrv_find(const char *name)
2624
{
2625
    BlockDriverState *bs;
2626

    
2627
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2628
        if (!strcmp(name, bs->device_name)) {
2629
            return bs;
2630
        }
2631
    }
2632
    return NULL;
2633
}
2634

    
2635
/*
 * Iterator over bdrv_states: a NULL 'bs' yields the first entry, any other
 * value yields the entry following 'bs'.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
2642

    
2643
/* Invoke 'it' with 'opaque' for every device in bdrv_states */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
2651

    
2652
/* Return the device name stored in 'bs' */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *device_name = bs->device_name;

    return device_name;
}
2656

    
2657
/* Return the open flags stored in 'bs' */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
2661

    
2662
void bdrv_flush_all(void)
2663
{
2664
    BlockDriverState *bs;
2665

    
2666
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2667
        bdrv_flush(bs);
2668
    }
2669
}
2670

    
2671
/*
 * Ask the driver's bdrv_has_zero_init callback; drivers without that
 * callback yield 1.  The device must have a driver.
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (!bs->drv->bdrv_has_zero_init) {
        return 1;
    }

    return bs->drv->bdrv_has_zero_init(bs);
}
2681

    
2682
typedef struct BdrvCoIsAllocatedData {
2683
    BlockDriverState *bs;
2684
    int64_t sector_num;
2685
    int nb_sectors;
2686
    int *pnum;
2687
    int ret;
2688
    bool done;
2689
} BdrvCoIsAllocatedData;
2690

    
2691
/*
2692
 * Returns true iff the specified sector is present in the disk image. Drivers
2693
 * not implementing the functionality are assumed to not support backing files,
2694
 * hence all their sectors are reported as allocated.
2695
 *
2696
 * If 'sector_num' is beyond the end of the disk image the return value is 0
2697
 * and 'pnum' is set to 0.
2698
 *
2699
 * 'pnum' is set to the number of sectors (including and immediately following
2700
 * the specified sector) that are known to be in the same
2701
 * allocated/unallocated state.
2702
 *
2703
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2704
 * beyond the end of the disk image it will be clamped.
2705
 */
2706
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2707
                                      int nb_sectors, int *pnum)
2708
{
2709
    int64_t n;
2710

    
2711
    if (sector_num >= bs->total_sectors) {
2712
        *pnum = 0;
2713
        return 0;
2714
    }
2715

    
2716
    n = bs->total_sectors - sector_num;
2717
    if (n < nb_sectors) {
2718
        nb_sectors = n;
2719
    }
2720

    
2721
    if (!bs->drv->bdrv_co_is_allocated) {
2722
        *pnum = nb_sectors;
2723
        return 1;
2724
    }
2725

    
2726
    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2727
}
2728

    
2729
/* Coroutine wrapper for bdrv_is_allocated() */
2730
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2731
{
2732
    BdrvCoIsAllocatedData *data = opaque;
2733
    BlockDriverState *bs = data->bs;
2734

    
2735
    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2736
                                     data->pnum);
2737
    data->done = true;
2738
}
2739

    
2740
/*
2741
 * Synchronous wrapper around bdrv_co_is_allocated().
2742
 *
2743
 * See bdrv_co_is_allocated() for details.
2744
 */
2745
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2746
                      int *pnum)
2747
{
2748
    Coroutine *co;
2749
    BdrvCoIsAllocatedData data = {
2750
        .bs = bs,
2751
        .sector_num = sector_num,
2752
        .nb_sectors = nb_sectors,
2753
        .pnum = pnum,
2754
        .done = false,
2755
    };
2756

    
2757
    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2758
    qemu_coroutine_enter(co, &data);
2759
    while (!data.done) {
2760
        qemu_aio_wait();
2761
    }
2762
    return data.ret;
2763
}
2764

    
2765
/*
2766
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2767
 *
2768
 * Return true if the given sector is allocated in any image between
2769
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
2770
 * sector is allocated in any image of the chain.  Return false otherwise.
2771
 *
2772
 * 'pnum' is set to the number of sectors (including and immediately following
2773
 *  the specified sector) that are known to be in the same
2774
 *  allocated/unallocated state.
2775
 *
2776
 */
2777
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2778
                                            BlockDriverState *base,
2779
                                            int64_t sector_num,
2780
                                            int nb_sectors, int *pnum)
2781
{
2782
    BlockDriverState *intermediate;
2783
    int ret, n = nb_sectors;
2784

    
2785
    intermediate = top;
2786
    while (intermediate && intermediate != base) {
2787
        int pnum_inter;
2788
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2789
                                   &pnum_inter);
2790
        if (ret < 0) {
2791
            return ret;
2792
        } else if (ret) {
2793
            *pnum = pnum_inter;
2794
            return 1;
2795
        }
2796

    
2797
        /*
2798
         * [sector_num, nb_sectors] is unallocated on top but intermediate
2799
         * might have
2800
         *
2801
         * [sector_num+x, nr_sectors] allocated.
2802
         */
2803
        if (n > pnum_inter &&
2804
            (intermediate == top ||
2805
             sector_num + pnum_inter < intermediate->total_sectors)) {
2806
            n = pnum_inter;
2807
        }
2808

    
2809
        intermediate = intermediate->backing_hd;
2810
    }
2811

    
2812
    *pnum = n;
2813
    return 0;
2814
}
2815

    
2816
/*
 * Build a newly allocated BlockInfo describing 'bs'.
 *
 * Tray, I/O status, dirty bitmap and inserted-medium details are only
 * filled in when the corresponding condition holds for the device; the
 * throttling limits are only copied while I/O limits are enabled.
 */
BlockInfo *bdrv_query_info(BlockDriverState *bs)
{
    BlockInfo *info = g_malloc0(sizeof(*info));

    info->device = g_strdup(bs->device_name);
    info->type = g_strdup("unknown");
    info->locked = bdrv_dev_is_medium_locked(bs);
    info->removable = bdrv_dev_has_removable_media(bs);

    if (bdrv_dev_has_removable_media(bs)) {
        info->has_tray_open = true;
        info->tray_open = bdrv_dev_is_tray_open(bs);
    }

    if (bdrv_iostatus_is_enabled(bs)) {
        info->has_io_status = true;
        info->io_status = bs->iostatus;
    }

    if (bs->dirty_bitmap) {
        info->has_dirty = true;
        info->dirty = g_malloc0(sizeof(*info->dirty));
        info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
        info->dirty->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
    }

    /* Everything below describes the inserted medium */
    if (!bs->drv) {
        return info;
    }

    info->has_inserted = true;
    info->inserted = g_malloc0(sizeof(*info->inserted));
    info->inserted->file = g_strdup(bs->filename);
    info->inserted->ro = bs->read_only;
    info->inserted->drv = g_strdup(bs->drv->format_name);
    info->inserted->encrypted = bs->encrypted;
    info->inserted->encryption_key_missing = bdrv_key_required(bs);

    if (bs->backing_file[0]) {
        info->inserted->has_backing_file = true;
        info->inserted->backing_file = g_strdup(bs->backing_file);
    }

    info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);

    if (bs->io_limits_enabled) {
        info->inserted->bps = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
        info->inserted->bps_rd = bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
        info->inserted->bps_wr = bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
        info->inserted->iops = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
        info->inserted->iops_rd = bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
        info->inserted->iops_wr = bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
    }

    return info;
}
2875

    
2876
/*
 * Build a list with one BlockInfo (from bdrv_query_info()) per device in
 * bdrv_states, in iteration order.  Returns NULL when there are no devices.
 * 'errp' is not written by this function.
 */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL;
    BlockInfoList **tail = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *entry = g_malloc0(sizeof(*entry));

        entry->value = bdrv_query_info(bs);

        /* append and advance the tail link */
        *tail = entry;
        tail = &entry->next;
    }

    return head;
}
2891

    
2892
BlockStats *bdrv_query_stats(const BlockDriverState *bs)
2893
{
2894
    BlockStats *s;
2895

    
2896
    s = g_malloc0(sizeof(*s));
2897

    
2898
    if (bs->device_name[0]) {
2899
        s->has_device = true;
2900
        s->device = g_strdup(bs->device_name);
2901
    }
2902

    
2903
    s->stats = g_malloc0(sizeof(*s->stats));
2904
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2905
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2906
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2907
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2908
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2909
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2910
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2911
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2912
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2913

    
2914
    if (bs->file) {
2915
        s->has_parent = true;
2916
        s->parent = bdrv_query_stats(bs->file);
2917
    }
2918

    
2919
    return s;
2920
}
2921

    
2922
/*
 * Build a list with one BlockStats (from bdrv_query_stats()) per device in
 * bdrv_states, in iteration order.  Returns NULL when there are no devices.
 * 'errp' is not written by this function.
 */
BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL;
    BlockStatsList **tail = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *entry = g_malloc0(sizeof(*entry));

        entry->value = bdrv_query_stats(bs);

        /* append and advance the tail link */
        *tail = entry;
        tail = &entry->next;
    }

    return head;
}
2937

    
2938
/*
 * Return the name of the encrypted image associated with @bs: the backing
 * file name if the backing image is encrypted, otherwise @bs's own file name
 * if @bs is encrypted, otherwise NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }

    if (bs->encrypted) {
        return bs->filename;
    }

    return NULL;
}
2947

    
2948
/*
 * Copy bs->backing_file into @filename using pstrcpy(), passing
 * @filename_size as the destination buffer size.
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing_name = bs->backing_file;

    pstrcpy(filename, filename_size, backing_name);
}
2953

    
2954
/*
 * Forward a compressed write to the driver of @bs.
 *
 * Returns -ENOMEDIUM when no driver is attached, -ENOTSUP when the driver has
 * no bdrv_write_compressed callback, -EIO when bdrv_check_request() rejects
 * the request, and the driver callback's result otherwise.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* This path must not be used while a dirty bitmap is attached */
    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
2969

    
2970
/*
 * Ask the driver of @bs to fill in @bdi.
 *
 * @bdi is zeroed before the driver callback runs.  Returns -ENOMEDIUM when no
 * driver is attached and -ENOTSUP when the driver has no bdrv_get_info
 * callback; in both cases @bdi is left untouched.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));

    return drv->bdrv_get_info(bs, bdi);
}
2980

    
2981
/*
 * Save VM state through the first node, starting at @bs and following
 * bs->file, whose driver provides a bdrv_save_vmstate callback.
 *
 * Returns -ENOMEDIUM as soon as a node without a driver is reached and
 * -ENOTSUP when the chain ends without any driver offering the callback.
 */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    for (;;) {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_save_vmstate) {
            return drv->bdrv_save_vmstate(bs, buf, pos, size);
        }
        if (!bs->file) {
            return -ENOTSUP;
        }

        bs = bs->file;
    }
}
2993

    
2994
/*
 * Load VM state through the first node, starting at @bs and following
 * bs->file, whose driver provides a bdrv_load_vmstate callback.
 *
 * Returns -ENOMEDIUM as soon as a node without a driver is reached and
 * -ENOTSUP when the chain ends without any driver offering the callback.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    for (;;) {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_load_vmstate) {
            return drv->bdrv_load_vmstate(bs, buf, pos, size);
        }
        if (!bs->file) {
            return -ENOTSUP;
        }

        bs = bs->file;
    }
}
3006

    
3007
/*
 * Pass @event to the driver's bdrv_debug_event callback.  Nothing happens
 * when @bs has no driver or the driver does not implement the callback.
 */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_debug_event) {
        drv->bdrv_debug_event(bs, event);
    }
}
3017

    
3018
/*
 * Invoke bdrv_debug_breakpoint on the first node, starting at @bs and
 * following bs->file, whose driver implements it.
 *
 * The walk stops at the first NULL node or node without a driver; in that
 * case, and when @bs itself is NULL, -ENOTSUP is returned.
 */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_breakpoint) {
            return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
        }
    }

    return -ENOTSUP;
}
3031

    
3032
/*
 * Invoke bdrv_debug_resume on the first node, starting at @bs and following
 * bs->file, whose driver implements it.
 *
 * The walk stops at the first NULL node or node without a driver; in that
 * case, and when @bs itself is NULL, -ENOTSUP is returned.
 */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_resume) {
            return bs->drv->bdrv_debug_resume(bs, tag);
        }
    }

    return -ENOTSUP;
}
3044

    
3045
/*
 * Query bdrv_debug_is_suspended on the first node, starting at @bs and
 * following bs->file, whose driver implements it.
 *
 * Returns false when no such node exists (including when @bs is NULL).
 */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_is_suspended) {
            return bs->drv->bdrv_debug_is_suspended(bs, tag);
        }
    }

    return false;
}
3057

    
3058
/**************************************************************/
3059
/* handling of snapshots */
3060

    
3061
/*
 * Return 1 if a snapshot can be created on @bs, 0 otherwise.
 *
 * Each node visited must have a driver, be inserted and be writable.  When a
 * node's driver lacks bdrv_snapshot_create, the same checks are repeated on
 * its bs->file; the answer is 0 once there is no further bs->file.
 */
int bdrv_can_snapshot(BlockDriverState *bs)
{
    for (;;) {
        BlockDriver *drv = bs->drv;

        if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
            return 0;
        }
        if (drv->bdrv_snapshot_create) {
            return 1;
        }
        if (bs->file == NULL) {
            return 0;
        }

        bs = bs->file;
    }
}
3077

    
3078
/* Return 1 if BDRV_O_SNAPSHOT is set in bs->open_flags, 0 otherwise. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
3082

    
3083
BlockDriverState *bdrv_snapshots(void)
3084
{
3085
    BlockDriverState *bs;
3086

    
3087
    if (bs_snapshots) {
3088
        return bs_snapshots;
3089
    }
3090

    
3091
    bs = NULL;
3092
    while ((bs = bdrv_next(bs))) {
3093
        if (bdrv_can_snapshot(bs)) {
3094
            bs_snapshots = bs;
3095
            return bs;
3096
        }
3097
    }
3098
    return NULL;
3099
}
3100

    
3101
int bdrv_snapshot_create(BlockDriverState *bs,
3102
                         QEMUSnapshotInfo *sn_info)
3103
{
3104
    BlockDriver *drv = bs->drv;
3105
    if (!drv)
3106
        return -ENOMEDIUM;
3107
    if (drv->bdrv_snapshot_create)
3108
        return drv->bdrv_snapshot_create(bs, sn_info);
3109
    if (bs->file)
3110
        return bdrv_snapshot_create(bs->file, sn_info);
3111
    return -ENOTSUP;
3112
}
3113

    
3114
/*
 * Revert @bs to the snapshot identified by @snapshot_id.
 *
 * If the driver implements bdrv_snapshot_goto, it is called directly.
 * Otherwise, when @bs has a bs->file, the driver is closed, the snapshot is
 * applied to bs->file, and the driver is re-opened with bs->open_flags.
 *
 * Returns -ENOMEDIUM when no driver is attached, -ENOTSUP when neither the
 * driver callback nor bs->file is available, the re-open error if re-opening
 * fails, and otherwise the result of the snapshot operation.
 *
 * If re-opening fails, bs->file is deleted and @bs is left without a driver
 * and without a file.
 */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_snapshot_goto) {
        return drv->bdrv_snapshot_goto(bs, snapshot_id);
    }

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            /*
             * Clear the pointer: code that follows bs->file without first
             * checking bs->drv would otherwise touch freed memory.
             */
            bs->file = NULL;
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}
3139

    
3140
/*
 * Delete the snapshot @snapshot_id through the first node, starting at @bs
 * and following bs->file, whose driver implements bdrv_snapshot_delete.
 *
 * Returns -ENOMEDIUM when a node without a driver is reached and -ENOTSUP
 * when the chain ends without a driver offering the callback.
 */
int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    for (;;) {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_snapshot_delete) {
            return drv->bdrv_snapshot_delete(bs, snapshot_id);
        }
        if (!bs->file) {
            return -ENOTSUP;
        }

        bs = bs->file;
    }
}
3151

    
3152
int bdrv_snapshot_list(BlockDriverState *bs,
3153
                       QEMUSnapshotInfo **psn_info)
3154
{
3155
    BlockDriver *drv = bs->drv;
3156
    if (!drv)
3157
        return -ENOMEDIUM;
3158
    if (drv->bdrv_snapshot_list)
3159
        return drv->bdrv_snapshot_list(bs, psn_info);
3160
    if (bs->file)
3161
        return bdrv_snapshot_list(bs->file, psn_info);
3162
    return -ENOTSUP;
3163
}
3164

    
3165
/*
 * Ask the driver of @bs to load the snapshot @snapshot_name via its
 * bdrv_snapshot_load_tmp callback.
 *
 * Returns -ENOMEDIUM without a driver, -EINVAL when @bs is not read-only and
 * -ENOTSUP when the driver lacks the callback.  Unlike the other snapshot
 * helpers in this file, bs->file is not consulted.
 */
int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->read_only) {
        return -EINVAL;
    }

    if (!drv->bdrv_snapshot_load_tmp) {
        return -ENOTSUP;
    }

    return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
}
3180

    
3181
/* backing_file can either be relative, or absolute, or a protocol.  If it is
3182
 * relative, it must be relative to the chain.  So, passing in bs->filename
3183
 * from a BDS as backing_file should not be done, as that may be relative to
3184
 * the CWD rather than the chain. */
3185
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3186
        const char *backing_file)
3187
{
3188
    char *filename_full = NULL;
3189
    char *backing_file_full = NULL;
3190
    char *filename_tmp = NULL;
3191
    int is_protocol = 0;
3192
    BlockDriverState *curr_bs = NULL;
3193
    BlockDriverState *retval = NULL;
3194

    
3195
    if (!bs || !bs->drv || !backing_file) {
3196
        return NULL;
3197
    }
3198

    
3199
    filename_full     = g_malloc(PATH_MAX);
3200
    backing_file_full = g_malloc(PATH_MAX);
3201
    filename_tmp      = g_malloc(PATH_MAX);
3202

    
3203
    is_protocol = path_has_protocol(backing_file);
3204

    
3205
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3206

    
3207
        /* If either of the filename paths is actually a protocol, then
3208
         * compare unmodified paths; otherwise make paths relative */
3209
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3210
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3211
                retval = curr_bs->backing_hd;
3212
                break;
3213
            }
3214
        } else {
3215
            /* If not an absolute filename path, make it relative to the current
3216
             * image's filename path */
3217
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3218
                         backing_file);
3219

    
3220
            /* We are going to compare absolute pathnames */
3221
            if (!realpath(filename_tmp, filename_full)) {
3222
                continue;
3223
            }
3224

    
3225
            /* We need to make sure the backing filename we are comparing against
3226
             * is relative to the current image filename (or absolute) */
3227
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3228
                         curr_bs->backing_file);
3229

    
3230
            if (!realpath(filename_tmp, backing_file_full)) {
3231
                continue;
3232
            }
3233

    
3234
            if (strcmp(backing_file_full, filename_full) == 0) {
3235
                retval = curr_bs->backing_hd;
3236
                break;
3237
            }
3238
        }
3239
    }
3240

    
3241
    g_free(filename_full);
3242
    g_free(backing_file_full);
3243
    g_free(filename_tmp);
3244
    return retval;
3245
}
3246

    
3247
/*
 * Count the backing images below @bs.  Counting stops at the first node that
 * has no driver or no backing_hd, so an image without backing file yields 0.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
3259

    
3260
/*
 * Follow backing_hd links from @bs and return the last node of the chain,
 * i.e. the one without a backing_hd.  Returns NULL when @bs is NULL.
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *base;

    if (bs == NULL) {
        return NULL;
    }

    for (base = bs; base->backing_hd; base = base->backing_hd) {
        /* keep descending */
    }

    return base;
}
3275

    
3276
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (at most @buf_size bytes, as with snprintf) in a
 * short human readable form and return @buf.
 *
 * Values up to 999 (including negative values) are printed as plain
 * integers.  Larger values are scaled by powers of 1024 and suffixed with
 * K, M, G or T: one decimal is shown while the scaled value is below 10,
 * otherwise the value is rounded to the nearest integer.  Values too large
 * for the T range are still expressed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    base = 1024;
    for (i = 0; i < NB_SUFFIXES; i++) {
        if (size < (10 * base)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / base,
                     suffixes[i]);
            break;
        } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
            /*
             * Round to nearest using quotient and remainder: adding
             * base / 2 to size first would overflow for sizes close to
             * INT64_MAX.
             */
            int64_t scaled = size / base;

            if (size % base >= (base >> 1)) {
                scaled++;
            }
            snprintf(buf, buf_size, "%" PRId64 "%c", scaled, suffixes[i]);
            break;
        }
        base = base * 1024;
    }

    return buf;
}
3305

    
3306
/*
 * Format one line of the snapshot table into @buf (at most @buf_size bytes)
 * and return @buf.
 *
 * With @sn == NULL the column header is produced.  Otherwise the line shows
 * the snapshot id, its name, the VM state size in human readable form, the
 * creation date in local time and the VM clock as hh:mm:ss.mmm.
 */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char size_buf[128], date_buf[128], clock_buf[128];
    struct tm tm;
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
        return buf;
    }

    /* Creation date, local time */
    ti = sn->date_sec;
    localtime_r(&ti, &tm);
    strftime(date_buf, sizeof(date_buf),
             "%Y-%m-%d %H:%M:%S", &tm);

    /* VM clock: nanoseconds split into hours, minutes, seconds, millis */
    secs = sn->vm_clock_nsec / 1000000000;
    snprintf(clock_buf, sizeof(clock_buf),
             "%02d:%02d:%02d.%03d",
             (int)(secs / 3600),
             (int)((secs / 60) % 60),
             (int)(secs % 60),
             (int)((sn->vm_clock_nsec / 1000000) % 1000));

    get_human_readable_size(size_buf, sizeof(size_buf), sn->vm_state_size);

    snprintf(buf, buf_size,
             "%-10s%-20s%7s%20s%15s",
             sn->id_str, sn->name,
             size_buf,
             date_buf,
             clock_buf);

    return buf;
}
3338

    
3339
/**************************************************************/
3340
/* async I/Os */
3341

    
3342
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3343
                                 QEMUIOVector *qiov, int nb_sectors,
3344
                                 BlockDriverCompletionFunc *cb, void *opaque)
3345
{
3346
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3347

    
3348
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3349
                                 cb, opaque, false);
3350
}
3351

    
3352
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3353
                                  QEMUIOVector *qiov, int nb_sectors,
3354
                                  BlockDriverCompletionFunc *cb, void *opaque)
3355
{
3356
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3357

    
3358
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3359
                                 cb, opaque, true);
3360
}
3361

    
3362

    
3363
typedef struct MultiwriteCB {
3364
    int error;
3365
    int num_requests;
3366
    int num_callbacks;
3367
    struct {
3368
        BlockDriverCompletionFunc *cb;
3369
        void *opaque;
3370
        QEMUIOVector *free_qiov;
3371
    } callbacks[];
3372
} MultiwriteCB;
3373

    
3374
/*
 * Run every caller-supplied completion callback of @mcb with the collected
 * error code, releasing the merged iovec recorded for each entry afterwards.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx;

    for (idx = 0; idx < mcb->num_callbacks; idx++) {
        QEMUIOVector *merged;

        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        merged = mcb->callbacks[idx].free_qiov;
        if (merged) {
            qemu_iovec_destroy(merged);
        }
        g_free(merged);
    }
}
3386

    
3387
static void multiwrite_cb(void *opaque, int ret)
3388
{
3389
    MultiwriteCB *mcb = opaque;
3390

    
3391
    trace_multiwrite_cb(mcb, ret);
3392

    
3393
    if (ret < 0 && !mcb->error) {
3394
        mcb->error = ret;
3395
    }
3396

    
3397
    mcb->num_requests--;
3398
    if (mcb->num_requests == 0) {
3399
        multiwrite_user_cb(mcb);
3400
        g_free(mcb);
3401
    }
3402
}
3403

    
3404
static int multiwrite_req_compare(const void *a, const void *b)
3405
{
3406
    const BlockRequest *req1 = a, *req2 = b;
3407

    
3408
    /*
3409
     * Note that we can't simply subtract req2->sector from req1->sector
3410
     * here as that could overflow the return value.
3411
     */
3412
    if (req1->sector > req2->sector) {
3413
        return 1;
3414
    } else if (req1->sector < req2->sector) {
3415
        return -1;
3416
    } else {
3417
        return 0;
3418
    }
3419
}
3420

    
3421
/*
3422
 * Takes a bunch of requests and tries to merge them. Returns the number of
3423
 * requests that remain after merging.
3424
 */
3425
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3426
    int num_reqs, MultiwriteCB *mcb)
3427
{
3428
    int i, outidx;
3429

    
3430
    // Sort requests by start sector
3431
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3432

    
3433
    // Check if adjacent requests touch the same clusters. If so, combine them,
3434
    // filling up gaps with zero sectors.
3435
    outidx = 0;
3436
    for (i = 1; i < num_reqs; i++) {
3437
        int merge = 0;
3438
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3439

    
3440
        // Handle exactly sequential writes and overlapping writes.
3441
        if (reqs[i].sector <= oldreq_last) {
3442
            merge = 1;
3443
        }
3444

    
3445
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3446
            merge = 0;
3447
        }
3448

    
3449
        if (merge) {
3450
            size_t size;
3451
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3452
            qemu_iovec_init(qiov,
3453
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3454

    
3455
            // Add the first request to the merged one. If the requests are
3456
            // overlapping, drop the last sectors of the first request.
3457
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
3458
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3459

    
3460
            // We should need to add any zeros between the two requests
3461
            assert (reqs[i].sector <= oldreq_last);
3462

    
3463
            // Add the second request
3464
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3465

    
3466
            reqs[outidx].nb_sectors = qiov->size >> 9;
3467
            reqs[outidx].qiov = qiov;
3468

    
3469
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3470
        } else {
3471
            outidx++;
3472
            reqs[outidx].sector     = reqs[i].sector;
3473
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3474
            reqs[outidx].qiov       = reqs[i].qiov;
3475
        }
3476
    }
3477

    
3478
    return outidx + 1;
3479
}
3480

    
3481
/*
3482
 * Submit multiple AIO write requests at once.
3483
 *
3484
 * On success, the function returns 0 and all requests in the reqs array have
3485
 * been submitted. In error case this function returns -1, and any of the
3486
 * requests may or may not be submitted yet. In particular, this means that the
3487
 * callback will be called for some of the requests, for others it won't. The
3488
 * caller must check the error field of the BlockRequest to wait for the right
3489
 * callbacks (if error != 0, no callback will be called).
3490
 *
3491
 * The implementation may modify the contents of the reqs array, e.g. to merge
3492
 * requests. However, the fields opaque and error are left unmodified as they
3493
 * are used to signal failure for a single request to the caller.
3494
 */
3495
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3496
{
3497
    MultiwriteCB *mcb;
3498
    int i;
3499

    
3500
    /* don't submit writes if we don't have a medium */
3501
    if (bs->drv == NULL) {
3502
        for (i = 0; i < num_reqs; i++) {
3503
            reqs[i].error = -ENOMEDIUM;
3504
        }
3505
        return -1;
3506
    }
3507

    
3508
    if (num_reqs == 0) {
3509
        return 0;
3510
    }
3511

    
3512
    // Create MultiwriteCB structure
3513
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3514
    mcb->num_requests = 0;
3515
    mcb->num_callbacks = num_reqs;
3516

    
3517
    for (i = 0; i < num_reqs; i++) {
3518
        mcb->callbacks[i].cb = reqs[i].cb;
3519
        mcb->callbacks[i].opaque = reqs[i].opaque;
3520
    }
3521

    
3522
    // Check for mergable requests
3523
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3524

    
3525
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3526

    
3527
    /* Run the aio requests. */
3528
    mcb->num_requests = num_reqs;
3529
    for (i = 0; i < num_reqs; i++) {
3530
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3531
            reqs[i].nb_sectors, multiwrite_cb, mcb);
3532
    }
3533

    
3534
    return 0;
3535
}
3536

    
3537
/* Cancel @acb by invoking the cancel handler of its AIOCBInfo. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    const AIOCBInfo *info = acb->aiocb_info;

    info->cancel(acb);
}
3541

    
3542
/* block I/O throttling */
3543
/*
 * Check whether a request of @nb_sectors sectors would exceed the configured
 * bytes-per-second limit within the current slice.
 *
 * The total limit takes precedence over the per-direction limit.  Returns
 * false with *@wait set to 0 when no limit applies or the request still
 * fits.  Otherwise the slice is extended, *@wait receives the computed delay
 * and true is returned.  @wait may be NULL.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   slice_secs, allowed_bytes, done_bytes, pending_bytes;
    double   delay;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        /* Neither a total nor a per-direction limit is configured */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    slice_secs = bs->slice_end - bs->slice_start;
    slice_secs /= (NANOSECONDS_PER_SECOND);
    allowed_bytes = bps_limit * slice_secs;

    /*
     * done_bytes: bytes already transferred in this slice, taken from the
     * accounting counters (both directions when the total limit applies).
     */
    done_bytes = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        done_bytes += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* pending_bytes: bytes this request still needs to transfer */
    pending_bytes = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (done_bytes + pending_bytes <= allowed_bytes) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /*
     * (done_bytes + pending_bytes) / bps_limit approximates the total time
     * needed to transfer all data; subtract what has already elapsed.
     */
    delay = (done_bytes + pending_bytes) / bps_limit - elapsed_time;

    /*
     * When the I/O rate at runtime exceeds the limits, bs->slice_end needs to
     * be extended so that the current statistics are kept until the timer
     * fires; the factors below were tuned experimentally.
     */
    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3602

    
3603
/*
 * Check whether one more request would exceed the configured
 * operations-per-second limit within the current slice.
 *
 * The total limit takes precedence over the per-direction limit.  Returns
 * false with *@wait set to 0 when no limit applies or the request still
 * fits.  Otherwise the slice is extended, *@wait receives the computed delay
 * and true is returned.  @wait may be NULL.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   slice_secs, allowed_ops, done_ops;
    double   delay;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        /* Neither a total nor a per-direction limit is configured */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    slice_secs = bs->slice_end - bs->slice_start;
    slice_secs /= (NANOSECONDS_PER_SECOND);
    allowed_ops = iops_limit * slice_secs;

    /* Operations already issued in this slice */
    done_ops = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        done_ops += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (done_ops + 1 <= allowed_ops) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Approximate time to dispatch, never negative */
    delay = (done_ops + 1) / iops_limit;
    delay = (delay > elapsed_time) ? delay - elapsed_time : 0;

    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3654

    
3655
/*
 * Decide whether a request of @nb_sectors sectors must be throttled.
 *
 * The current slice is prolonged when "now" lies inside it; otherwise a new
 * slice is started and the accounting baselines are reset.  Both the bps and
 * the iops check are always evaluated.  If either limit is exceeded, *@wait
 * receives the larger of the two delays, the slice end is pushed out to
 * cover it and true is returned.  Otherwise *@wait is 0 and false is
 * returned.  @wait may be NULL.
 */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_delay = 0, iops_delay = 0;
    double   elapsed;
    int      over_bps, over_iops;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* Still inside the running slice: just move its end */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Begin a new slice and take fresh baselines for both directions */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed  = now - bs->slice_start;
    elapsed  /= (NANOSECONDS_PER_SECOND);

    /* Both checks run unconditionally; each may adjust the slice */
    over_bps  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                       is_write, elapsed, &bps_delay);
    over_iops = bdrv_exceed_iops_limits(bs, is_write,
                                        elapsed, &iops_delay);

    if (!over_bps && !over_iops) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    max_wait = bps_delay > iops_delay ? bps_delay : iops_delay;
    if (wait) {
        *wait = max_wait;
    }

    /* Make sure the slice lasts at least until the delay has passed */
    now = qemu_get_clock_ns(vm_clock);
    if (bs->slice_end < now + max_wait) {
        bs->slice_end = now + max_wait;
    }

    return true;
}
3706

    
3707
/**************************************************************/
3708
/* async block device emulation */
3709

    
3710
typedef struct BlockDriverAIOCBSync {
3711
    BlockDriverAIOCB common;
3712
    QEMUBH *bh;
3713
    int ret;
3714
    /* vector translation state */
3715
    QEMUIOVector *qiov;
3716
    uint8_t *bounce;
3717
    int is_write;
3718
} BlockDriverAIOCBSync;
3719

    
3720
/*
 * Cancel an emulated AIO request: the pending completion bottom half is
 * deleted, so the user callback will not run, and the AIOCB is released.
 *
 * The bounce buffer is freed here as well.  It is otherwise released only by
 * the bottom half (bdrv_aio_bh_cb), which no longer runs once the request
 * has been cancelled.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_vfree(acb->bounce);
    acb->bounce = NULL;
    qemu_aio_release(acb);
}
3728

    
3729
static const AIOCBInfo bdrv_em_aiocb_info = {
3730
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3731
    .cancel             = bdrv_aio_cancel_em,
3732
};
3733

    
3734
static void bdrv_aio_bh_cb(void *opaque)
3735
{
3736
    BlockDriverAIOCBSync *acb = opaque;
3737

    
3738
    if (!acb->is_write)
3739
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3740
    qemu_vfree(acb->bounce);
3741
    acb->common.cb(acb->common.opaque, acb->ret);
3742
    qemu_bh_delete(acb->bh);
3743
    acb->bh = NULL;
3744
    qemu_aio_release(acb);
3745
}
3746

    
3747
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3748
                                            int64_t sector_num,
3749
                                            QEMUIOVector *qiov,
3750
                                            int nb_sectors,
3751
                                            BlockDriverCompletionFunc *cb,
3752
                                            void *opaque,
3753
                                            int is_write)
3754

    
3755
{
3756
    BlockDriverAIOCBSync *acb;
3757

    
3758
    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
3759
    acb->is_write = is_write;
3760
    acb->qiov = qiov;
3761
    acb->bounce = qemu_blockalign(bs, qiov->size);
3762
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3763

    
3764
    if (is_write) {
3765
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3766
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3767
    } else {
3768
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3769
    }
3770

    
3771
    qemu_bh_schedule(acb->bh);
3772

    
3773
    return &acb->common;
3774
}
3775

    
3776
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3777
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3778
        BlockDriverCompletionFunc *cb, void *opaque)
3779
{
3780
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3781
}
3782

    
3783
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3784
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3785
        BlockDriverCompletionFunc *cb, void *opaque)
3786
{
3787
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3788
}
3789

    
3790

    
3791
typedef struct BlockDriverAIOCBCoroutine {
3792
    BlockDriverAIOCB common;
3793
    BlockRequest req;
3794
    bool is_write;
3795
    bool *done;
3796
    QEMUBH* bh;
3797
} BlockDriverAIOCBCoroutine;
3798

    
3799
/*
 * "Cancel" a coroutine based request by waiting for it to finish: a local
 * flag is registered in the AIOCB and qemu_aio_wait() is called until the
 * completion bottom half sets it.
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    bool finished = false;
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);

    acb->done = &finished;

    while (!finished) {
        qemu_aio_wait();
    }
}
3810

    
3811
static const AIOCBInfo bdrv_em_co_aiocb_info = {
3812
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3813
    .cancel             = bdrv_aio_co_cancel_em,
3814
};
3815

    
3816
static void bdrv_co_em_bh(void *opaque)
3817
{
3818
    BlockDriverAIOCBCoroutine *acb = opaque;
3819

    
3820
    acb->common.cb(acb->common.opaque, acb->req.error);
3821

    
3822
    if (acb->done) {
3823
        *acb->done = true;
3824
    }
3825

    
3826
    qemu_bh_delete(acb->bh);
3827
    qemu_aio_release(acb);
3828
}
3829

    
3830
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3831
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3832
{
3833
    BlockDriverAIOCBCoroutine *acb = opaque;
3834
    BlockDriverState *bs = acb->common.bs;
3835

    
3836
    if (!acb->is_write) {
3837
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3838
            acb->req.nb_sectors, acb->req.qiov, 0);
3839
    } else {
3840
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3841
            acb->req.nb_sectors, acb->req.qiov, 0);
3842
    }
3843

    
3844
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3845
    qemu_bh_schedule(acb->bh);
3846
}
3847

    
3848
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3849
                                               int64_t sector_num,
3850
                                               QEMUIOVector *qiov,
3851
                                               int nb_sectors,
3852
                                               BlockDriverCompletionFunc *cb,
3853
                                               void *opaque,
3854
                                               bool is_write)
3855
{
3856
    Coroutine *co;
3857
    BlockDriverAIOCBCoroutine *acb;
3858

    
3859
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3860
    acb->req.sector = sector_num;
3861
    acb->req.nb_sectors = nb_sectors;
3862
    acb->req.qiov = qiov;
3863
    acb->is_write = is_write;
3864
    acb->done = NULL;
3865

    
3866
    co = qemu_coroutine_create(bdrv_co_do_rw);
3867
    qemu_coroutine_enter(co, acb);
3868

    
3869
    return &acb->common;
3870
}
3871

    
3872
/*
 * Coroutine entry point for bdrv_aio_flush(): flush the device via
 * bdrv_co_flush(), store the result in the AIOCB and schedule
 * bdrv_co_em_bh() to report it.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *request = opaque;

    request->req.error = bdrv_co_flush(request->common.bs);

    request->bh = qemu_bh_new(bdrv_co_em_bh, request);
    qemu_bh_schedule(request->bh);
}
3881

    
3882
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3883
        BlockDriverCompletionFunc *cb, void *opaque)
3884
{
3885
    trace_bdrv_aio_flush(bs, opaque);
3886

    
3887
    Coroutine *co;
3888
    BlockDriverAIOCBCoroutine *acb;
3889

    
3890
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3891
    acb->done = NULL;
3892

    
3893
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3894
    qemu_coroutine_enter(co, acb);
3895

    
3896
    return &acb->common;
3897
}
3898

    
3899
/*
 * Coroutine entry point for bdrv_aio_discard(): discard the sector range
 * stored in the AIOCB via bdrv_co_discard(), record the result and schedule
 * bdrv_co_em_bh() to report it.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *request = opaque;

    request->req.error = bdrv_co_discard(request->common.bs,
                                         request->req.sector,
                                         request->req.nb_sectors);

    request->bh = qemu_bh_new(bdrv_co_em_bh, request);
    qemu_bh_schedule(request->bh);
}
3908

    
3909
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3910
        int64_t sector_num, int nb_sectors,
3911
        BlockDriverCompletionFunc *cb, void *opaque)
3912
{
3913
    Coroutine *co;
3914
    BlockDriverAIOCBCoroutine *acb;
3915

    
3916
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3917

    
3918
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3919
    acb->req.sector = sector_num;
3920
    acb->req.nb_sectors = nb_sectors;
3921
    acb->done = NULL;
3922
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3923
    qemu_coroutine_enter(co, acb);
3924

    
3925
    return &acb->common;
3926
}
3927

    
3928
void bdrv_init(void)
3929
{
3930
    module_call_init(MODULE_INIT_BLOCK);
3931
}
3932

    
3933
void bdrv_init_with_whitelist(void)
3934
{
3935
    use_bdrv_whitelist = 1;
3936
    bdrv_init();
3937
}
3938

    
3939
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
3940
                   BlockDriverCompletionFunc *cb, void *opaque)
3941
{
3942
    BlockDriverAIOCB *acb;
3943

    
3944
    acb = g_slice_alloc(aiocb_info->aiocb_size);
3945
    acb->aiocb_info = aiocb_info;
3946
    acb->bs = bs;
3947
    acb->cb = cb;
3948
    acb->opaque = opaque;
3949
    return acb;
3950
}
3951

    
3952
void qemu_aio_release(void *p)
3953
{
3954
    BlockDriverAIOCB *acb = p;
3955
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
3956
}
3957

    
3958
/**************************************************************/
3959
/* Coroutine block device emulation */
3960

    
3961
typedef struct CoroutineIOCompletion {
3962
    Coroutine *coroutine;
3963
    int ret;
3964
} CoroutineIOCompletion;
3965

    
3966
static void bdrv_co_io_em_complete(void *opaque, int ret)
3967
{
3968
    CoroutineIOCompletion *co = opaque;
3969

    
3970
    co->ret = ret;
3971
    qemu_coroutine_enter(co->coroutine, NULL);
3972
}
3973

    
3974
/*
 * Emulate a coroutine read or write on top of the driver's AIO callbacks.
 *
 * Submits the request through bdrv_aio_writev() or bdrv_aio_readv() of the
 * driver, yields, and returns the result stored by bdrv_co_io_em_complete().
 *
 * Returns -EIO if the driver did not hand back an AIOCB.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &completion);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &completion);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);

    if (acb == NULL) {
        return -EIO;
    }

    /* Sleep until bdrv_co_io_em_complete() re-enters this coroutine. */
    qemu_coroutine_yield();

    return completion.ret;
}
3999

    
4000
/* Coroutine read emulated on top of AIO; see bdrv_co_io_em(). */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4006

    
4007
/* Coroutine write emulated on top of AIO; see bdrv_co_io_em(). */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4013

    
4014
/*
 * Coroutine entry point for bdrv_flush(): flush the device named in the
 * RwCo and store the result in it.
 */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *call = opaque;

    call->ret = bdrv_co_flush(call->bs);
}
4020

    
4021
/*
 * Flush @bs and then, recursively, its underlying bs->file.
 *
 * Returns 0 without doing anything when @bs is NULL, has no medium inserted
 * or is read-only. Otherwise returns the first negative error reported by
 * the driver callbacks, or the result of flushing bs->file.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };
            BlockDriverAIOCB *acb;

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
            if (acb != NULL) {
                /* Woken up again by bdrv_co_io_em_complete(). */
                qemu_coroutine_yield();
                ret = co.ret;
            } else {
                ret = -EIO;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and don't support bdrv_flush therefore. Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk. Returning an
             * error doesn't work because that would break guests even if
             * the server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }

        if (ret < 0) {
            return ret;
        }
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
4081

    
4082
/*
 * Invoke the driver's bdrv_invalidate_cache callback for @bs, if the device
 * has a driver and the driver implements it; otherwise do nothing.
 */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (!bs->drv) {
        return;
    }
    if (!bs->drv->bdrv_invalidate_cache) {
        return;
    }

    bs->drv->bdrv_invalidate_cache(bs);
}
4088

    
4089
void bdrv_invalidate_cache_all(void)
4090
{
4091
    BlockDriverState *bs;
4092

    
4093
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4094
        bdrv_invalidate_cache(bs);
4095
    }
4096
}
4097

    
4098
void bdrv_clear_incoming_migration_all(void)
4099
{
4100
    BlockDriverState *bs;
4101

    
4102
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4103
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4104
    }
4105
}
4106

    
4107
/*
 * Synchronous flush of @bs.
 *
 * Inside a coroutine the flush runs directly. Otherwise a new coroutine is
 * started and qemu_aio_wait() is called until it has stored a result.
 *
 * Returns the result of bdrv_co_flush().
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *worker = qemu_coroutine_create(bdrv_flush_co_entry);

        qemu_coroutine_enter(worker, &rwco);
        /* ret keeps the NOT_DONE marker until the coroutine has finished. */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    }

    return rwco.ret;
}
4128

    
4129
/*
 * Coroutine entry point for bdrv_discard(): discard the range described by
 * the RwCo and store the result in it.
 */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *call = opaque;

    call->ret = bdrv_co_discard(call->bs, call->sector_num, call->nb_sectors);
}
4135

    
4136
/*
 * Discard @nb_sectors sectors starting at @sector_num on @bs.
 *
 * Returns -ENOMEDIUM if the device has no driver, -EIO if
 * bdrv_check_request() rejects the range or the driver's AIO discard hands
 * back no AIOCB, and -EROFS if the device is read-only. Otherwise returns
 * the driver's result, or 0 when the driver implements no discard callback.
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
    if (bs->read_only) {
        return -EROFS;
    }

    /* The range is dropped from the dirty bitmap, when one is active. */
    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Prefer the native coroutine callback. */
    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    }

    /* Otherwise emulate it on top of the AIO callback. */
    if (bs->drv->bdrv_aio_discard) {
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };
        BlockDriverAIOCB *acb;

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        }

        /* Woken up again by bdrv_co_io_em_complete(). */
        qemu_coroutine_yield();
        return co.ret;
    }

    return 0;
}
4171

    
4172
/*
 * Synchronous discard of @nb_sectors sectors starting at @sector_num.
 *
 * Inside a coroutine the discard runs directly. Otherwise a new coroutine is
 * started and qemu_aio_wait() is called until it has stored a result.
 *
 * Returns the result of bdrv_co_discard().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *worker = qemu_coroutine_create(bdrv_discard_co_entry);

        qemu_coroutine_enter(worker, &rwco);
        /* ret keeps the NOT_DONE marker until the coroutine has finished. */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    }

    return rwco.ret;
}
4195

    
4196
/**************************************************************/
4197
/* removable device support */
4198

    
4199
/**
4200
 * Return TRUE if the media is present
4201
 */
4202
int bdrv_is_inserted(BlockDriverState *bs)
4203
{
4204
    BlockDriver *drv = bs->drv;
4205

    
4206
    if (!drv)
4207
        return 0;
4208
    if (!drv->bdrv_is_inserted)
4209
        return 1;
4210
    return drv->bdrv_is_inserted(bs);
4211
}
4212

    
4213
/**
4214
 * Return whether the media changed since the last call to this
4215
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4216
 */
4217
int bdrv_media_changed(BlockDriverState *bs)
4218
{
4219
    BlockDriver *drv = bs->drv;
4220

    
4221
    if (drv && drv->bdrv_media_changed) {
4222
        return drv->bdrv_media_changed(bs);
4223
    }
4224
    return -ENOTSUP;
4225
}
4226

    
4227
/**
4228
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4229
 */
4230
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4231
{
4232
    BlockDriver *drv = bs->drv;
4233

    
4234
    if (drv && drv->bdrv_eject) {
4235
        drv->bdrv_eject(bs, eject_flag);
4236
    }
4237

    
4238
    if (bs->device_name[0] != '\0') {
4239
        bdrv_emit_qmp_eject_event(bs, eject_flag);
4240
    }
4241
}
4242

    
4243
/**
4244
 * Lock or unlock the media (if it is locked, the user won't be able
4245
 * to eject it manually).
4246
 */
4247
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4248
{
4249
    BlockDriver *drv = bs->drv;
4250

    
4251
    trace_bdrv_lock_medium(bs, locked);
4252

    
4253
    if (drv && drv->bdrv_lock_medium) {
4254
        drv->bdrv_lock_medium(bs, locked);
4255
    }
4256
}
4257

    
4258
/* needed for generic scsi interface */
4259

    
4260
/*
 * Forward an ioctl request to the driver of @bs.
 *
 * Returns the driver's result, or -ENOTSUP when there is no driver or it has
 * no bdrv_ioctl callback.
 */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *driver = bs->drv;

    if (!driver || !driver->bdrv_ioctl) {
        return -ENOTSUP;
    }

    return driver->bdrv_ioctl(bs, req, buf);
}
4268

    
4269
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4270
        unsigned long int req, void *buf,
4271
        BlockDriverCompletionFunc *cb, void *opaque)
4272
{
4273
    BlockDriver *drv = bs->drv;
4274

    
4275
    if (drv && drv->bdrv_aio_ioctl)
4276
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4277
    return NULL;
4278
}
4279

    
4280
/* Record @align as the buffer alignment of @bs. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
4284

    
4285
/*
 * Allocate @size bytes with qemu_memalign(), aligned to the buffer alignment
 * of @bs. An alignment of 512 is used when @bs is NULL or its alignment is
 * zero.
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    size_t alignment = 512;

    if (bs && bs->buffer_alignment) {
        alignment = bs->buffer_alignment;
    }

    return qemu_memalign(alignment, size);
}
4289

    
4290
/*
4291
 * Check if all memory in this vector is sector aligned.
4292
 */
4293
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4294
{
4295
    int i;
4296

    
4297
    for (i = 0; i < qiov->niov; i++) {
4298
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4299
            return false;
4300
        }
4301
    }
4302

    
4303
    return true;
4304
}
4305

    
4306
/*
 * Enable or disable dirty tracking on @bs.
 *
 * @granularity must be zero or a power of two (asserted). Zero frees any
 * existing dirty bitmap. A non-zero value is shifted right by
 * BDRV_SECTOR_BITS and a new bitmap covering the device length in sectors is
 * allocated; no bitmap may exist already in that case (asserted).
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t nb_sectors;

    assert((granularity & (granularity - 1)) == 0);

    if (!granularity) {
        /* Turning tracking off: drop the bitmap if there is one. */
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
        return;
    }

    granularity >>= BDRV_SECTOR_BITS;
    assert(!bs->dirty_bitmap);
    nb_sectors = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    /* ffs() - 1 is the bit index of the (single) set bit, i.e. its log2. */
    bs->dirty_bitmap = hbitmap_alloc(nb_sectors, ffs(granularity) - 1);
}
4324

    
4325
/*
 * Return the dirty bitmap's value for @sector, or 0 when @bs has no dirty
 * bitmap.
 */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_get(bs->dirty_bitmap, sector);
}
4333

    
4334
/* Initialise @hbi to iterate over the dirty bitmap of @bs from position 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}
4338

    
4339
/*
 * Set @nr_sectors sectors starting at @cur_sector in the dirty bitmap of
 * @bs. No check for a missing bitmap is made here.
 */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}
4344

    
4345
/*
 * Reset @nr_sectors sectors starting at @cur_sector in the dirty bitmap of
 * @bs. No check for a missing bitmap is made here.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}
4350

    
4351
/*
 * Return hbitmap_count() of the dirty bitmap of @bs, or 0 when @bs has no
 * dirty bitmap.
 */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_count(bs->dirty_bitmap);
}
4359

    
4360
/*
 * Change the in_use flag of @bs. Setting it to the value it already has is
 * a programming error (asserted).
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
4365

    
4366
/* Return the in_use flag of @bs. */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
4370

    
4371
/* Turn on I/O status tracking for @bs and start from the OK state. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    bs->iostatus_enabled = true;
}
4376

    
4377
/* The I/O status is only enabled if the drive explicitly
4378
 * enables it _and_ the VM is configured to stop on errors */
4379
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4380
{
4381
    return (bs->iostatus_enabled &&
4382
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4383
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4384
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4385
}
4386

    
4387
/* Turn off I/O status tracking for @bs; the stored status is left as is. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
4391

    
4392
/*
 * Put the I/O status of @bs back to OK and reset the I/O status of its block
 * job, if it has one. Does nothing unless bdrv_iostatus_is_enabled().
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    if (bs->job) {
        block_job_iostatus_reset(bs->job);
    }
}
4401

    
4402
/*
 * Record an I/O error on @bs. I/O status tracking must be enabled
 * (asserted).
 *
 * Only a status of OK is overwritten: ENOSPC maps to NOSPACE, every other
 * @error value to FAILED. An already recorded error is kept.
 */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));

    if (bs->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
        return;
    }

    if (error == ENOSPC) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
    } else {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
4410

    
4411
void
4412
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4413
        enum BlockAcctType type)
4414
{
4415
    assert(type < BDRV_MAX_IOTYPE);
4416

    
4417
    cookie->bytes = bytes;
4418
    cookie->start_time_ns = get_clock();
4419
    cookie->type = type;
4420
}
4421

    
4422
void
4423
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4424
{
4425
    assert(cookie->type < BDRV_MAX_IOTYPE);
4426

    
4427
    bs->nr_bytes[cookie->type] += cookie->bytes;
4428
    bs->nr_ops[cookie->type]++;
4429
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4430
}
4431

    
4432
/*
 * Create a new image file.
 *
 * @filename:      image to create; also used to look up the protocol driver
 * @fmt:           name of the image format driver
 * @base_filename: backing file to record in the options, or NULL
 * @base_fmt:      backing file format to record in the options, or NULL
 * @options:       additional creation options to parse (-o), or NULL
 * @img_size:      value stored as the initial BLOCK_OPT_SIZE option
 * @flags:         open flags; used, with BDRV_O_RDWR, BDRV_O_SNAPSHOT and
 *                 BDRV_O_NO_BACKING cleared, to open the backing file when
 *                 its size has to be queried
 * @errp:          receives a description of the failure, if any
 *
 * If the size option ends up as -1, the size is taken from the backing file;
 * without a backing file that is an error. A "Formatting ..." line listing
 * the final options is printed to stdout before the image is created.
 */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags, Error **errp)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /*
     * The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there
     */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            /* Named so that it does not shadow the option parameter "size". */
            uint64_t backing_size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s'",
                                 backing_file->value.s);
                goto out;
            }
            /* The geometry is reported in 512-byte sectors. */
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= 512;

            /* The value is unsigned, so format it with PRIu64. */
            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_setg(errp,"Formatting or formatting option not supported for "
                            "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_setg(errp, "The image size is too large for file format '%s'",
                       fmt);
        } else {
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
                       strerror(-ret));
        }
    }

out:
    /* Common cleanup for the success path and every failure after setup. */
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }
}