/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu.h"
#include "notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are kept in FIFO order: the next throttled request is not
     * dequeued until the current request has been allowed to proceed. If the
     * current request still exceeds the limits, it is re-inserted at the head
     * of the queue, so all requests behind it remain queued in order.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
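
/* Illustrative examples of the function above (not part of the original
 * source; the paths are made up):
 *
 *     path_combine(buf, sizeof(buf), "/img/base.qcow2", "backing.qcow2");
 *         => "/img/backing.qcow2"
 *     path_combine(buf, sizeof(buf), "/img/base.qcow2", "/abs/backing.qcow2");
 *         => "/abs/backing.qcow2"   (absolute filename is copied as-is)
 *     path_combine(buf, sizeof(buf), "nbd:host:1234", "backing.qcow2");
 *         => "nbd:backing.qcow2"    (the "<protocol>:" prefix is kept, which
 *                                    is how URL-style base paths are handled)
 */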

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
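
/* Usage sketch (illustrative only; "test.qcow2" and the size value are
 * made-up examples, following the same pattern bdrv_open() below uses for
 * its temporary snapshot file):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *options =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(options, BLOCK_OPT_SIZE, 1024 * 1024);
 *     ret = bdrv_create(drv, "test.qcow2", options);
 *     free_option_parameters(options);
 */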

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
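
/* Typical use (this is exactly the pattern bdrv_open() below follows): the
 * caller supplies a PATH_MAX-sized buffer and checks the negative-errno
 * return value:
 *
 *     char tmp_filename[PATH_MAX + 1];
 *     ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *     if (ret < 0) {
 *         return ret;
 *     }
 */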

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
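
/* Examples of the lookup above (illustrative; which protocol drivers are
 * actually registered depends on the build): "/images/disk.img" has no
 * "<protocol>:" prefix and maps to the "file" driver, while for
 * "nbd:localhost:10809" the prefix "nbd" is extracted and the driver whose
 * protocol_name is "nbd" is returned, if registered. Note that the host
 * device probe above can still take precedence over an explicit prefix.
 */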

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
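
/* Summary of the mapping implemented above, for reference:
 *
 *     mode          NOCACHE  CACHE_WB  NO_FLUSH
 *     off/none        yes      yes        no
 *     directsync      yes      no         no
 *     writeback       no       yes        no
 *     unsafe          no       yes        yes
 *     writethrough    no       no         no    (the default)
 */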

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
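
/* Sketch of the intended pairing (illustrative): each user brackets its use
 * of the feature, and copy-on-read only turns off after the last disable:
 *
 *     bdrv_enable_copy_on_read(bs);    // refcount 0 -> 1, CoR active
 *     bdrv_enable_copy_on_read(bs);    // refcount 1 -> 2, still active
 *     bdrv_disable_copy_on_read(bs);   // refcount 2 -> 1, still active
 *     bdrv_disable_copy_on_read(bs);   // refcount 1 -> 0, CoR off
 */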

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = bdrv_open_flags(bs, flags);

    bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        if (file != NULL) {
            bdrv_swap(file, bs);
            ret = 0;
        } else {
            ret = drv->bdrv_file_open(bs, filename, open_flags);
        }
    } else {
        assert(file != NULL);
        bs->file = file;
        ret = drv->bdrv_open(bs, open_flags);
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, NULL, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

int bdrv_open_backing_file(BlockDriverState *bs)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;

    if (bs->backing_hd != NULL) {
        return 0;
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (bs->backing_file[0] == '\0') {
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
    if (ret < 0) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        return ret;
    }
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    ret = bdrv_file_open(&file, filename, bdrv_open_flags(bs, flags));
    if (ret < 0) {
        return ret;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(file, filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_delete(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        ret = bdrv_open_backing_file(bs);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_delete(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be part of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
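
/* Usage sketch for an atomic multi-device reopen (illustrative; bs1/bs2 and
 * the flag values are made up -- bdrv_reopen() below is the single-device
 * convenience wrapper around this same pattern). Note that queuing a bs also
 * queues its bs->file automatically, and that bdrv_reopen_multiple() frees
 * the queue:
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs1, flags1);
 *     queue = bdrv_reopen_queue(queue, bs2, flags2);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 */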

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare() implementation.
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state.
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_set(errp, QERR_OPEN_FILE_FAILED,
                          reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also clear the device_name to prevent a double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_time         = bs_src->slice_time;
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->io_base            = bs_src->io_base;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_count        = bs_src->dirty_count;
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the backing image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
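
/* Worked example (illustrative numbers): with a 64 KB cluster size,
 * c = 65536 / 512 = 128 sectors per cluster.  For sector_num = 130 and
 * nb_sectors = 10:
 *
 *     cluster_sector_num = QEMU_ALIGN_DOWN(130, 128)          = 128
 *     cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128) = 128
 *
 * so the region [130, 140) is widened to the containing cluster [128, 256).
 */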

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
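
/* Example (illustrative): with req covering sectors [0, 8), a query for
 * [8, 16) does not overlap (the first check rejects ranges starting at or
 * after req's end), while a query for [4, 12) does overlap.
 */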

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}
1747

    
1748

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}
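
/* For example, given the chain  base <- mid <- active,
 * bdrv_find_overlay(active, base) returns mid, and
 * bdrv_find_overlay(active, active) returns NULL.
 */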

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that its backing
 * file information can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_delete(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}
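
/* Note that the length check above is phrased as (len - offset < size)
 * rather than (offset + size > len) so that a huge 'size' cannot overflow
 * the signed 64-bit addition.
 */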

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In a synchronous call context the vcpu is blocked, so the throttling
     * timer will never fire; therefore I/O throttling has to be disabled
     * here if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
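
/* bdrv_read() and bdrv_write() below are thin synchronous wrappers around
 * bdrv_rw_co(); bdrv_pread() and bdrv_pwrite() build byte-granular access
 * on top of them.
 */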

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
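
/* Each bit covers BDRV_SECTORS_PER_DIRTY_CHUNK sectors.  Assuming the
 * usual value of 2048 sectors (1 MiB) per chunk and 64-bit longs, marking
 * sectors 4096..8191 dirty touches chunks 2 and 3, i.e. bits 2 and 3 of
 * dirty_bitmap[0].
 */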

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
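
/* For example, a hypothetical bdrv_pread(bs, 700, buf, 1000) is carried out
 * in three steps: 324 bytes from the tail of sector 1, all 512 bytes of
 * sector 2 read "in place", and the remaining 164 bytes from the head of
 * sector 3.
 */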

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* read-modify-write the partial last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}
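
/* bdrv_pwrite() mirrors bdrv_pread(): the unaligned head and tail of the
 * byte range are handled with a read-modify-write of one sector through
 * tmp_buf, while fully covered sectors are written directly.
 */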

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating the cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
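
/* Continuing the 128-sector-cluster example from round_to_clusters(): a
 * copy-on-read of sectors [100, 140) reads and writes back the whole
 * cluster range [0, 256), then copies only the requested 40 sectors into
 * the caller's qiov, starting at skip_bytes = 100 * BDRV_SECTOR_SIZE.
 */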

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        bdrv_set_dirty(bs, sector_num, nb_sectors);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}
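
/* Note that in writethrough mode (enable_write_cache clear), every
 * successful write above is followed by bdrv_co_flush() before the request
 * completes, which is what gives cache=writethrough its semantics.
 */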

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image of the chain
 * above BASE, up to and including TOP (TOP is checked, BASE itself is not).
 * BASE can be NULL to check if the given sector is allocated in any image
 * of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

BlockInfo *bdrv_query_info(BlockDriverState *bs)
{
    BlockInfo *info = g_malloc0(sizeof(*info));
    info->device = g_strdup(bs->device_name);
    info->type = g_strdup("unknown");
    info->locked = bdrv_dev_is_medium_locked(bs);
    info->removable = bdrv_dev_has_removable_media(bs);

    if (bdrv_dev_has_removable_media(bs)) {
        info->has_tray_open = true;
        info->tray_open = bdrv_dev_is_tray_open(bs);
    }

    if (bdrv_iostatus_is_enabled(bs)) {
        info->has_io_status = true;
        info->io_status = bs->iostatus;
    }

    if (bs->dirty_bitmap) {
        info->has_dirty = true;
        info->dirty = g_malloc0(sizeof(*info->dirty));
        info->dirty->count = bdrv_get_dirty_count(bs) *
            BDRV_SECTORS_PER_DIRTY_CHUNK * BDRV_SECTOR_SIZE;
    }

    if (bs->drv) {
        info->has_inserted = true;
        info->inserted = g_malloc0(sizeof(*info->inserted));
        info->inserted->file = g_strdup(bs->filename);
        info->inserted->ro = bs->read_only;
        info->inserted->drv = g_strdup(bs->drv->format_name);
        info->inserted->encrypted = bs->encrypted;
        info->inserted->encryption_key_missing = bdrv_key_required(bs);

        if (bs->backing_file[0]) {
            info->inserted->has_backing_file = true;
            info->inserted->backing_file = g_strdup(bs->backing_file);
        }

        info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);

        if (bs->io_limits_enabled) {
            info->inserted->bps =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
            info->inserted->bps_rd =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
            info->inserted->bps_wr =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
            info->inserted->iops =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
            info->inserted->iops_rd =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
            info->inserted->iops_wr =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
        }
    }
    return info;
}

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));
        info->value = bdrv_query_info(bs);

        *p_next = info;
        p_next = &info->next;
    }

    return head;
}

BlockStats *bdrv_query_stats(const BlockDriverState *bs)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = bdrv_query_stats(bs->file);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = bdrv_query_stats(bs);

        *p_next = info;
        p_next = &info->next;
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}

char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests are exactly sequential or overlapping and,
    // if so, combine them into a single request.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

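/* Editor's note: a small worked example of the merge rule above, assuming
 * 512-byte sectors.  Request A covers sectors [0, 8) and request B starts at
 * sector 8 (exactly sequential) or at sector 6 (overlapping): in both cases
 * reqs[B].sector <= oldreq_last (8), so the two are combined.  The merged
 * qiov takes (B.sector - A.sector) << 9 bytes from A -- 4096 bytes when B
 * starts at 8, 3072 bytes when B starts at 6, dropping A's overlapped tail --
 * followed by all of B.  A request starting at sector 9 would leave a gap
 * and is not merged.
 */
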
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the requests
 * may or may not have been submitted yet. In particular, this means that the
 * callback will be called for some of the requests, but not for others. The
 * caller must check the error field of each BlockRequest to know which
 * callbacks to wait for (if error != 0, no callback will be called for that
 * request).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}

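/* Editor's note: a minimal sketch of a bdrv_aio_multiwrite() caller, under
 * the assumption that each BlockRequest's sector/nb_sectors/qiov fields have
 * already been filled in (e.g. by a virtio-blk-style batch).
 * "example_multiwrite_cb" and "example_submit_batch" are hypothetical names.
 */
#if 0
static void example_multiwrite_cb(void *opaque, int ret)
{
    /* called once per original request, with ret < 0 if the batch failed */
}

static void example_submit_batch(BlockDriverState *bs,
                                 BlockRequest *reqs, int num_reqs)
{
    int i;

    for (i = 0; i < num_reqs; i++) {
        reqs[i].cb = example_multiwrite_cb;
        reqs[i].opaque = NULL;
    }

    if (bdrv_aio_multiwrite(bs, reqs, num_reqs) < 0) {
        /* only requests with error == 0 will still get their callback */
        for (i = 0; i < num_reqs; i++) {
            if (reqs[i].error != 0) {
                /* handle immediate failure for this request */
            }
        }
    }
}
#endif
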
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the number of bytes already read/written in the current
     *             slice, taken from the accounting statistics.
     * bytes_res:  the number of bytes still to be read/written by this
     *             request.
     * (bytes_base + bytes_res) / bps_limit: the total time needed to
     *             complete reading/writing all of this data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limit, bs->slice_end needs to
     * be extended so that the current statistics are kept until the timer
     * fires; the extension factor was tuned experimentally.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}

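/* Editor's note: a worked example of the bps check above, with assumed
 * numbers.  Suppose bps_limit = 1 MiB/s (1048576 B/s) and the current slice
 * is 0.1 s long, so bytes_limit = 104857.6.  If 90000 bytes have already
 * been transferred in this slice (bytes_base) and a new request wants
 * 32 sectors, bytes_res = 32 * 512 = 16384.  Since 90000 + 16384 exceeds
 * 104857.6, the request is throttled, and its dispatch is delayed by roughly
 * (90000 + 16384) / 1048576 - elapsed_time seconds.
 */
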
/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_flush(bs, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

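/* Editor's note: a minimal sketch of how a block driver defines its own
 * AIOCB type on top of qemu_aio_get()/qemu_aio_release(), following the
 * pattern of BlockDriverAIOCBSync above.  The embedded BlockDriverAIOCB must
 * be the first field so container_of() works; "MyAIOCB", "my_aio_cancel" and
 * "my_submit" are hypothetical names.
 */
#if 0
typedef struct MyAIOCB {
    BlockDriverAIOCB common;    /* must be the first field */
    int my_state;
} MyAIOCB;

static void my_aio_cancel(BlockDriverAIOCB *blockacb)
{
    MyAIOCB *acb = container_of(blockacb, MyAIOCB, common);
    /* stop the in-flight operation, then free the AIOCB */
    qemu_aio_release(acb);
}

static const AIOCBInfo my_aiocb_info = {
    .aiocb_size = sizeof(MyAIOCB),
    .cancel     = my_aio_cancel,
};

static BlockDriverAIOCB *my_submit(BlockDriverState *bs,
                                   BlockDriverCompletionFunc *cb, void *opaque)
{
    MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);

    acb->my_state = 0;
    /* ... kick off the request; invoke acb->common.cb() on completion ... */
    return &acb->common;
}
#endif
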
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

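/* Editor's note: a short sketch of the two ways to request a flush, given
 * the coroutine dispatch above.  Synchronous callers use bdrv_flush(), which
 * spawns a coroutine and pumps qemu_aio_wait() until it finishes; code that
 * is already a coroutine_fn calls bdrv_co_flush() directly and simply yields
 * while the driver works.  The "example_*" wrappers are hypothetical.
 */
#if 0
static int example_flush_sync(BlockDriverState *bs)
{
    /* safe outside coroutine context: spins the main loop until done */
    return bdrv_flush(bs);
}

static int coroutine_fn example_flush_co(BlockDriverState *bs)
{
    /* already in a coroutine: call the coroutine version directly */
    return bdrv_co_flush(bs);
}
#endif
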
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

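/* Editor's note: a minimal discard sketch.  Sector numbers are 512-byte
 * units, so dropping 1 MiB starting at the 4 MiB offset means 2048 sectors
 * starting at sector 8192; drivers without discard support silently
 * return 0.  "example_discard_1mb" is a hypothetical name.
 */
#if 0
static int example_discard_1mb(BlockDriverState *bs)
{
    return bdrv_discard(bs, 8192, 2048);
}
#endif
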
/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

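/* Editor's note: a small usage sketch for qemu_blockalign().  Buffers handed
 * to the block layer should honour the device's alignment requirement (e.g.
 * for O_DIRECT files); pair each allocation with qemu_vfree().
 * "example_aligned_buffer" is a hypothetical name.
 */
#if 0
static void example_aligned_buffer(BlockDriverState *bs)
{
    void *buf = qemu_blockalign(bs, 65536);    /* aligned 64 KiB buffer */

    /* ... fill the buffer and submit I/O with it ... */

    qemu_vfree(buf);
}
#endif
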
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] &
            (1UL << (chunk % BITS_PER_LONG)));
    } else {
        return 0;
    }
}

int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk;
    int bit, elem;

    /* Avoid an infinite loop.  */
    assert(bs->dirty_count > 0);

    sector = (sector | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
    chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    QEMU_BUILD_BUG_ON(sizeof(bs->dirty_bitmap[0]) * 8 != BITS_PER_LONG);
    elem = chunk / BITS_PER_LONG;
    bit = chunk % BITS_PER_LONG;
    for (;;) {
        if (sector >= bs->total_sectors) {
            sector = 0;
            bit = elem = 0;
        }
        if (bit == 0 && bs->dirty_bitmap[elem] == 0) {
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
            elem++;
        } else {
            if (bs->dirty_bitmap[elem] & (1UL << bit)) {
                return sector;
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
            if (++bit == BITS_PER_LONG) {
                bit = 0;
                elem++;
            }
        }
    }
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 1);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

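/* Editor's note: a worked example of the bitmap arithmetic above, assuming
 * BDRV_SECTORS_PER_DIRTY_CHUNK = 2048 (1 MiB chunks of 512-byte sectors) and
 * 64-bit longs.  Sector 5000000 lives in chunk 5000000 / 2048 = 2441, which
 * is word 2441 / 64 = 38 of the bitmap, bit 2441 % 64 = 9; bdrv_get_dirty()
 * therefore tests dirty_bitmap[38] & (1UL << 9).
 */
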
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

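/* Editor's note: a minimal accounting sketch for a device model.  The cookie
 * brackets one guest request so its bytes, op count and latency land in the
 * per-type statistics; the 4 KiB byte count and "example_*" names are
 * illustrative, BDRV_ACCT_READ comes from the BlockAcctType enum.
 */
#if 0
static void example_accounted_read_done(BlockDriverState *bs,
                                        BlockAcctCookie *cookie)
{
    /* called from the request's completion path */
    bdrv_acct_done(bs, cookie);
}

static void example_accounted_read_start(BlockDriverState *bs,
                                         BlockAcctCookie *cookie)
{
    bdrv_acct_start(bs, cookie, 4096, BDRV_ACCT_READ);
    /* ... submit the 4 KiB read; on completion call bdrv_acct_done() ... */
}
#endif
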
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags, Error **errp)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t backing_size;
            char buf[32];
            int back_flags;

            /* backing files are always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s'",
                                 backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= 512;

            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_setg(errp, "Formatting or formatting option not supported "
                             "for file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_setg(errp, "The image size is too large for file format '%s'",
                       fmt);
        } else {
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
                       strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }
}
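
/* Editor's note: a minimal sketch of a bdrv_img_create() caller, e.g. what a
 * "create image" command might do.  The qcow2 format, file name, 1 GiB size
 * and NULL backing/options arguments are illustrative values;
 * "example_create_image" is a hypothetical name.
 */
#if 0
static void example_create_image(void)
{
    Error *local_err = NULL;

    bdrv_img_create("test.qcow2", "qcow2",
                    NULL, NULL,             /* no backing file/format */
                    NULL,                   /* no -o options */
                    1024 * 1024 * 1024,     /* 1 GiB */
                    0, &local_err);
    if (error_is_set(&local_err)) {
        fprintf(stderr, "%s\n", error_get_pretty(local_err));
        error_free(local_err);
    }
}
#endif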