/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "blockjob.h"
#include "module.h"
#include "qjson.h"
#include "sysemu.h"
#include "notify.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are kept in FIFO order: the next throttled request is not
     * dequeued until the current request is allowed to be serviced. So if
     * the current request still exceeds the limits, it is re-inserted at
     * the head of the queue, and all requests that follow it remain in the
     * throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

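/*
 * Enabling sketch (hypothetical caller; values illustrative): the limits
 * themselves live in bs->io_limits and are populated before enabling, e.g.
 *
 *     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = 1024 * 1024;
 *     bdrv_io_limits_enable(bs);
 *     assert(bdrv_io_limits_enabled(bs));
 */
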
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

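/*
 * Usage sketch (hypothetical paths): the base path keeps its directory (or
 * URL) prefix and the relative filename replaces its last component:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/img/base.qcow2", "backing.raw");
 *     // dest is "/img/backing.raw"
 *     path_combine(dest, sizeof(dest), "http://host/base.img", "b.img");
 *     // dest is "http://host/b.img"
 */
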
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
277
{
278
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
279
        pstrcpy(dest, sz, bs->backing_file);
280
    } else {
281
        path_combine(dest, sz, bs->filename, bs->backing_file);
282
    }
283
}
284

    
285
void bdrv_register(BlockDriver *bdrv)
286
{
287
    /* Block drivers without coroutine functions need emulation */
288
    if (!bdrv->bdrv_co_readv) {
289
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
290
        bdrv->bdrv_co_writev = bdrv_co_writev_em;
291

    
292
        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
293
         * the block driver lacks aio we need to emulate that too.
294
         */
295
        if (!bdrv->bdrv_aio_readv) {
296
            /* add AIO emulation layer */
297
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
298
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
299
        }
300
    }
301

    
302
    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
303
}
304

    
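/*
 * Registration sketch (hypothetical driver; real drivers live in their own
 * files and register themselves from a module init function via the
 * block_init() macro from module.h):
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",                // hypothetical name
 *         .instance_size = sizeof(BDRVMydrvState), // hypothetical state
 *         .bdrv_open     = mydrv_open,             // hypothetical callbacks
 *         .bdrv_close    = mydrv_close,
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv); // fills in missing co/aio emulation
 *     }
 *     block_init(bdrv_mydrv_init);
 */
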
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

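/*
 * Creation sketch (hypothetical values; the same pattern appears in the
 * temporary-snapshot path of bdrv_open() below):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 16 * 1024 * 1024);
 *     ret = bdrv_create(drv, "/tmp/test.qcow2", opts);
 *     free_option_parameters(opts);
 */
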
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

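/*
 * Usage sketch (hypothetical caller):
 *
 *     char tmp[PATH_MAX];
 *     if (get_tmp_filename(tmp, sizeof(tmp)) == 0) {
 *         // tmp now names an empty file such as /tmp/vl.Ab12Cd
 *     }
 */
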
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

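/*
 * Resolution sketch (illustrative; which protocol drivers exist depends on
 * the build):
 *
 *     bdrv_find_protocol("/var/lib/img.qcow2");
 *     // no "proto:" prefix -> falls back to the "file" driver
 *     bdrv_find_protocol("nbd:localhost:10809");
 *     // -> the driver whose protocol_name is "nbd", if registered
 */
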
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

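/*
 * Flag-mapping sketch (hypothetical caller; assumes BDRV_O_CACHE_MASK
 * covers all three cache-related bits):
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     bdrv_parse_cache_flags("writethrough", &flags);
 *     // flags == 0: the default mode sets no extra bits
 */
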
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

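/*
 * Usage sketch (hypothetical caller): a protocol-level open, without any
 * image format layered on top:
 *
 *     BlockDriverState *file_bs;
 *     ret = bdrv_file_open(&file_bs, "/var/lib/img.qcow2", 0);
 *     if (ret == 0) {
 *         // ... raw access to the file's bytes ...
 *         bdrv_delete(file_bs);
 *     }
 */
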
int bdrv_open_backing_file(BlockDriverState *bs)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;

    if (bs->backing_hd != NULL) {
        return 0;
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (bs->backing_file[0] == '\0') {
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
    if (ret < 0) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        return ret;
    }
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        ret = bdrv_open_backing_file(bs);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

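/*
 * Usage sketch (hypothetical caller; mirrors the internal probe above):
 *
 *     BlockDriverState *bs = bdrv_new("");
 *     ret = bdrv_open(bs, "/var/lib/img.qcow2", BDRV_O_RDWR, NULL);
 *     // a NULL driver means the format is probed via find_image_format()
 *     if (ret < 0) {
 *         bdrv_delete(bs);
 *     }
 */
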
typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs.
 *
 * Returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

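/*
 * Transactional sketch (hypothetical bs_a/bs_b; bdrv_reopen() below does
 * the same for a single device):
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs_a, flags_a);
 *     queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *     ret = bdrv_reopen_multiple(queue, &errp);
 *     // on return, either both devices were reopened or neither was
 */
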
/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_set(errp, QERR_OPEN_FILE_FAILED,
                          reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NUL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_time         = bs_src->slice_time;
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->io_base            = bs_src->io_base;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_count        = bs_src->dirty_count;
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

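/*
 * Typical use sketch (external snapshot; bs_new/bs_top hypothetical):
 * create and open a new image whose backing file is the current top image,
 * keep it anonymous, then:
 *
 *     bdrv_append(bs_new, bs_top);
 *     // the guest device keeps pointing at bs_top, which now holds the
 *     // new image's contents, with the old top as its backing file
 */
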
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags =  bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

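/*
 * Worked example (hypothetical 64 KB clusters, i.e. c = 128 sectors):
 * sector_num = 200 and nb_sectors = 10 round to cluster_sector_num = 128
 * and cluster_nb_sectors = 128, i.e. the whole cluster [128, 256).
 */
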
1681
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1682
                                     int64_t sector_num, int nb_sectors) {
1683
    /*        aaaa   bbbb */
1684
    if (sector_num >= req->sector_num + req->nb_sectors) {
1685
        return false;
1686
    }
1687
    /* bbbb   aaaa        */
1688
    if (req->sector_num >= sector_num + nb_sectors) {
1689
        return false;
1690
    }
1691
    return true;
1692
}
1693

    
1694
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1695
        int64_t sector_num, int nb_sectors)
1696
{
1697
    BdrvTrackedRequest *req;
1698
    int64_t cluster_sector_num;
1699
    int cluster_nb_sectors;
1700
    bool retry;
1701

    
1702
    /* If we touch the same cluster it counts as an overlap.  This guarantees
1703
     * that allocating writes will be serialized and not race with each other
1704
     * for the same cluster.  For example, in copy-on-read it ensures that the
1705
     * CoR read and write operations are atomic and guest writes cannot
1706
     * interleave between them.
1707
     */
1708
    round_to_clusters(bs, sector_num, nb_sectors,
1709
                      &cluster_sector_num, &cluster_nb_sectors);
1710

    
1711
    do {
1712
        retry = false;
1713
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
1714
            if (tracked_request_overlaps(req, cluster_sector_num,
1715
                                         cluster_nb_sectors)) {
1716
                /* Hitting this means there was a reentrant request, for
1717
                 * example, a block driver issuing nested requests.  This must
1718
                 * never happen since it means deadlock.
1719
                 */
1720
                assert(qemu_coroutine_self() != req->co);
1721

    
1722
                qemu_co_queue_wait(&req->wait_queue);
1723
                retry = true;
1724
                break;
1725
            }
1726
        }
1727
    } while (retry);
1728
}
1729

    
1730
/*
1731
 * Return values:
1732
 * 0        - success
1733
 * -EINVAL  - backing format specified, but no file
1734
 * -ENOSPC  - can't update the backing file because no space is left in the
1735
 *            image file header
1736
 * -ENOTSUP - format driver doesn't support changing the backing file
1737
 */
1738
int bdrv_change_backing_file(BlockDriverState *bs,
1739
    const char *backing_file, const char *backing_fmt)
1740
{
1741
    BlockDriver *drv = bs->drv;
1742
    int ret;
1743

    
1744
    /* Backing file format doesn't make sense without a backing file */
1745
    if (backing_fmt && !backing_file) {
1746
        return -EINVAL;
1747
    }
1748

    
1749
    if (drv->bdrv_change_backing_file != NULL) {
1750
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1751
    } else {
1752
        ret = -ENOTSUP;
1753
    }
1754

    
1755
    if (ret == 0) {
1756
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1757
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1758
    }
1759
    return ret;
1760
}
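
/* Usage sketch (illustrative, not taken from a caller in this file): rebase
 * an image onto a new backing file and map the documented return codes.
 * The file name and format below are made up for the example.
 *
 *     ret = bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");
 *     if (ret == -ENOTSUP) {
 *         ... the format driver cannot rewrite its header ...
 *     } else if (ret == -ENOSPC) {
 *         ... the header has no room for the longer backing file name ...
 *     }
 */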

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}
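
/* Illustrative restatement of the contract above: for a chain
 * base <- mid <- top, bdrv_find_overlay(top, base) returns mid,
 * bdrv_find_overlay(top, mid) returns top, and bdrv_find_overlay(top, top)
 * returns NULL.
 */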

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;


    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_delete(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
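
/* Usage sketch (illustrative): collapsing the documented example chain
 * bottom <- base <- intermediate <- top <- active down to
 * bottom <- base <- active.  The variables are assumed to hold the
 * corresponding BlockDriverState pointers.
 *
 *     if (bdrv_drop_intermediate(active, top, base) < 0) {
 *         ... active == top, or the chain walk did not reach 'base' ...
 *     }
 */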

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
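
/* Note: this wrap-and-poll pattern (create the coroutine, enter it, then
 * spin on qemu_aio_wait() until the entry function stores a result) is how
 * the synchronous wrappers in this file are built; bdrv_is_allocated()
 * below uses the same shape with a 'done' flag instead of NOT_DONE.
 */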

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
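
/* Worked example (illustrative, assuming BDRV_SECTORS_PER_DIRTY_CHUNK is
 * 2048, i.e. 1 MiB of 512-byte sectors, and 64-bit longs): marking
 * sector_num = 4096, nb_sectors = 4096 dirty touches chunks 2..3; both
 * land in dirty_bitmap[0] (idx = 0) as bits 2 and 3, and dirty_count grows
 * by at most 2.
 */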

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
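
/* Worked example (illustrative): for offset = 1000, count1 = 2000 and
 * 512-byte sectors, the head read covers bytes 1000..1023 of sector 1
 * (len = 24), the middle loop reads sectors 2..4 "in place" (1536 bytes),
 * and the tail read copies the remaining 440 bytes from sector 5.
 */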

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
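
/* Typical use (illustrative): callers updating on-disk metadata, e.g. an
 * image header, use bdrv_pwrite_sync() so the write cannot be reordered
 * after later writes that depend on it.
 */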

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        bdrv_set_dirty(bs, sector_num, nb_sectors);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}
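
/* Usage sketch (illustrative, coroutine context): walk an image in
 * allocated/unallocated extents, querying up to 65536 sectors at a time.
 *
 *     int64_t sector = 0;
 *     while (sector < bs->total_sectors) {
 *         int pnum;
 *         int ret = bdrv_co_is_allocated(bs, sector, 65536, &pnum);
 *         if (ret < 0) {
 *             ... error ...
 *         }
 *         ... ret == 1: [sector, sector + pnum) is allocated ...
 *         sector += pnum;
 *     }
 */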

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

BlockInfo *bdrv_query_info(BlockDriverState *bs)
{
    BlockInfo *info = g_malloc0(sizeof(*info));
    info->device = g_strdup(bs->device_name);
    info->type = g_strdup("unknown");
    info->locked = bdrv_dev_is_medium_locked(bs);
    info->removable = bdrv_dev_has_removable_media(bs);

    if (bdrv_dev_has_removable_media(bs)) {
        info->has_tray_open = true;
        info->tray_open = bdrv_dev_is_tray_open(bs);
    }

    if (bdrv_iostatus_is_enabled(bs)) {
        info->has_io_status = true;
        info->io_status = bs->iostatus;
    }

    if (bs->dirty_bitmap) {
        info->has_dirty = true;
        info->dirty = g_malloc0(sizeof(*info->dirty));
        info->dirty->count = bdrv_get_dirty_count(bs) *
            BDRV_SECTORS_PER_DIRTY_CHUNK * BDRV_SECTOR_SIZE;
    }

    if (bs->drv) {
        info->has_inserted = true;
        info->inserted = g_malloc0(sizeof(*info->inserted));
        info->inserted->file = g_strdup(bs->filename);
        info->inserted->ro = bs->read_only;
        info->inserted->drv = g_strdup(bs->drv->format_name);
        info->inserted->encrypted = bs->encrypted;
        info->inserted->encryption_key_missing = bdrv_key_required(bs);

        if (bs->backing_file[0]) {
            info->inserted->has_backing_file = true;
            info->inserted->backing_file = g_strdup(bs->backing_file);
        }

        info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);

        if (bs->io_limits_enabled) {
            info->inserted->bps =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
            info->inserted->bps_rd =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
            info->inserted->bps_wr =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
            info->inserted->iops =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
            info->inserted->iops_rd =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
            info->inserted->iops_wr =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
        }
    }
    return info;
}

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));
        info->value = bdrv_query_info(bs);

        *p_next = info;
        p_next = &info->next;
    }

    return head;
}

BlockStats *bdrv_query_stats(const BlockDriverState *bs)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = bdrv_query_stats(bs->file);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = bdrv_query_stats(bs);

        *p_next = info;
        p_next = &info->next;
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for(i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
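
/* Worked examples (illustrative): 999 -> "999"; 1536 -> "1.5K"; 51200 ->
 * "50K" (rounded to the nearest unit by adding base/2 before dividing);
 * 1048576 -> "1.0M".
 */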

char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
3465

    
3466
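/*
 * Worked example for multiwrite_merge above: two overlapping requests
 *     reqs[0]: sector 0, nb_sectors 8   (covers sectors [0, 8))
 *     reqs[1]: sector 4, nb_sectors 8   (covers sectors [4, 12))
 * merge into a single request for sectors [0, 12): the first
 * (4 - 0) << 9 = 2048 bytes are taken from reqs[0]'s qiov, followed by
 * all of reqs[1]'s qiov, and the merged qiov is recorded in
 * mcb->callbacks[i].free_qiov so it can be destroyed once the request
 * completes.
 */
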
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the requests
 * may or may not have been submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}

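/*
 * Illustrative bdrv_aio_multiwrite caller sketch (hypothetical names): each
 * BlockRequest is filled in with sector, nb_sectors, qiov, cb and opaque
 * before submission:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_write_cb, .opaque = ctx0 },
 *         { .sector = 16, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_write_cb, .opaque = ctx1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // consult reqs[i].error: callbacks only fire where error == 0
 *     }
 */
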
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the number of bytes already read/written in this slice,
     *             obtained from the history statistics.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calculate
     *             the total time for completing reading/writing all data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limit, bs->slice_end needs
     * to be extended so that the current statistics can be kept until the
     * timer fires; the value is increased and tuned based on experimental
     * results.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

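/*
 * Worked example for the bps check above (values assumed for
 * illustration): with bps_limit = 1048576 (1 MB/s) and a 0.5 s slice,
 * bytes_limit is 524288 bytes. If 520192 bytes (bytes_base) were already
 * transferred in this slice and a new 16384-byte request (bytes_res)
 * arrives, the budget is exceeded and the request must wait roughly
 * (520192 + 16384) / 1048576 - elapsed_time seconds before dispatch.
 */
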
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

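/*
 * A throttling "slice" is a sliding time window over which the byte and
 * I/O counters are compared against the configured limits. While requests
 * keep arriving inside the current window, slice_end is pushed forward;
 * once the window has fully expired, the slice is restarted and the
 * per-slice base counters are resynchronized from the running totals.
 */
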
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

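/*
 * Emulate an asynchronous request on top of a driver that only provides
 * synchronous bdrv_read/bdrv_write: the scatter/gather list is flattened
 * into a bounce buffer, the synchronous call is issued immediately, and
 * completion is signalled from a bottom half so the caller's callback
 * still runs from the main loop, as it would for real AIO.
 */
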
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

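/*
 * Emulate a coroutine read/write on top of a driver that only provides
 * AIO callbacks: the AIO request is submitted with bdrv_co_io_em_complete
 * as its completion function, the coroutine yields, and the completion
 * callback re-enters the coroutine once co.ret holds the result.
 */
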
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

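/*
 * Flush ordering: first the driver's internal caches are written back to
 * the host OS (bdrv_co_flush_to_os), then forced to stable storage
 * (bdrv_co_flush_to_disk) unless BDRV_O_NO_FLUSH is set, and finally the
 * underlying protocol (bs->file) is flushed recursively.
 */
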
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

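/*
 * Dirty bitmap layout: one bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors,
 * packed into unsigned longs. A sector maps to chunk
 * sector / BDRV_SECTORS_PER_DIRTY_CHUNK, which lives in bitmap element
 * chunk / BITS_PER_LONG at bit position chunk % BITS_PER_LONG. For
 * example, assuming 2048-sector (1 MB) chunks on a 64-bit host, a 1 GB
 * image has 2097152 sectors = 1024 chunk bits = 16 unsigned longs.
 */
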
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] &
            (1UL << (chunk % BITS_PER_LONG)));
    } else {
        return 0;
    }
}

int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk;
    int bit, elem;

    /* Avoid an infinite loop.  */
    assert(bs->dirty_count > 0);

    sector = (sector | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
    chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    QEMU_BUILD_BUG_ON(sizeof(bs->dirty_bitmap[0]) * 8 != BITS_PER_LONG);
    elem = chunk / BITS_PER_LONG;
    bit = chunk % BITS_PER_LONG;
    for (;;) {
        if (sector >= bs->total_sectors) {
            sector = 0;
            bit = elem = 0;
        }
        if (bit == 0 && bs->dirty_bitmap[elem] == 0) {
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
            elem++;
        } else {
            if (bs->dirty_bitmap[elem] & (1UL << bit)) {
                return sector;
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
            if (++bit == BITS_PER_LONG) {
                bit = 0;
                elem++;
            }
        }
    }
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 1);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

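/*
 * Typical accounting usage for the pair above (hypothetical
 * device-emulation sketch): a cookie is started when the request is
 * issued and completed from the request's callback, so bytes, op counts
 * and latency all land in the per-device statistics:
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_WRITE);
 *     // ... on completion:
 *     bdrv_acct_done(bs, &cookie);
 */
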
int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}