Statistics
| Branch: | Revision:

root / block.c @ 31410948

History | View | Annotate | Download (127.4 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "qmp-commands.h"
36
#include "qemu/timer.h"
37

    
38
#ifdef CONFIG_BSD
39
#include <sys/types.h>
40
#include <sys/stat.h>
41
#include <sys/ioctl.h>
42
#include <sys/queue.h>
43
#ifndef __DragonFly__
44
#include <sys/disk.h>
45
#endif
46
#endif
47

    
48
#ifdef _WIN32
49
#include <windows.h>
50
#endif
51

    
52
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
53

    
54
/* Flag values passed to the internal read/write helpers; each one occupies
 * its own bit. */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
58

    
59
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
60
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62
        BlockDriverCompletionFunc *cb, void *opaque);
63
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65
        BlockDriverCompletionFunc *cb, void *opaque);
66
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67
                                         int64_t sector_num, int nb_sectors,
68
                                         QEMUIOVector *iov);
69
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70
                                         int64_t sector_num, int nb_sectors,
71
                                         QEMUIOVector *iov);
72
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
73
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74
    BdrvRequestFlags flags);
75
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
76
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77
    BdrvRequestFlags flags);
78
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79
                                               int64_t sector_num,
80
                                               QEMUIOVector *qiov,
81
                                               int nb_sectors,
82
                                               BlockDriverCompletionFunc *cb,
83
                                               void *opaque,
84
                                               bool is_write);
85
static void coroutine_fn bdrv_co_do_rw(void *opaque);
86
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
87
    int64_t sector_num, int nb_sectors);
88

    
89
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
90
        bool is_write, double elapsed_time, uint64_t *wait);
91
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
92
        double elapsed_time, uint64_t *wait);
93
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
94
        bool is_write, int64_t *wait);
95

    
96
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
97
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
98

    
99
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
100
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
101

    
102
/* The device to use for VM snapshots */
103
static BlockDriverState *bs_snapshots;
104

    
105
/* If non-zero, use only whitelisted block drivers */
106
static int use_bdrv_whitelist;
107

    
108
#ifdef _WIN32
109
/*
 * Return 1 when filename starts with an ASCII letter immediately followed
 * by ':' (for example "c:"), 0 otherwise.  The second character is only
 * inspected when the first one is a letter.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char first = filename[0];
    int is_letter = (first >= 'a' && first <= 'z') ||
                    (first >= 'A' && first <= 'Z');

    if (!is_letter) {
        return 0;
    }
    return filename[1] == ':';
}
115

    
116
/*
 * Return 1 when filename is exactly "<letter>:" or begins with either
 * "\\.\" or "//./", 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    int bare_drive = is_windows_drive_prefix(filename) &&
                     filename[2] == '\0';

    if (bare_drive ||
        strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
126
#endif
127

    
128
/* throttling disk I/O limits */
129
void bdrv_io_limits_disable(BlockDriverState *bs)
130
{
131
    bs->io_limits_enabled = false;
132

    
133
    while (qemu_co_queue_next(&bs->throttled_reqs));
134

    
135
    if (bs->block_timer) {
136
        qemu_del_timer(bs->block_timer);
137
        qemu_free_timer(bs->block_timer);
138
        bs->block_timer = NULL;
139
    }
140

    
141
    bs->slice_start = 0;
142
    bs->slice_end   = 0;
143
    bs->slice_time  = 0;
144
    memset(&bs->io_base, 0, sizeof(bs->io_base));
145
}
146

    
147
static void bdrv_block_timer(void *opaque)
148
{
149
    BlockDriverState *bs = opaque;
150

    
151
    qemu_co_queue_next(&bs->throttled_reqs);
152
}
153

    
154
/*
 * Switch I/O throttling on for bs: initialise the throttled request queue,
 * create a vm_clock timer that fires bdrv_block_timer() with bs as its
 * argument, and set the enabled flag.
 */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);

    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);

    bs->io_limits_enabled = true;
}
160

    
161
/*
 * Return true when any of the read, write or total limits in
 * bs->io_limits (bytes-per-second or iops arrays) is non-zero.
 */
bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *limits = &bs->io_limits;

    if (limits->bps[BLOCK_IO_LIMIT_READ] ||
        limits->bps[BLOCK_IO_LIMIT_WRITE] ||
        limits->bps[BLOCK_IO_LIMIT_TOTAL]) {
        return true;
    }

    return limits->iops[BLOCK_IO_LIMIT_READ] ||
           limits->iops[BLOCK_IO_LIMIT_WRITE] ||
           limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
171

    
172
/*
 * Hold the calling request back while it would exceed the I/O limits of bs.
 *
 * is_write and nb_sectors describe the request and are forwarded to
 * bdrv_exceed_io_limits().
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t delay = -1;

    /* If other requests are already queued, wait on the queue first */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /*
     * Requests are meant to keep their timing in FIFO order: the next
     * throttled request is not dequeued until the current one is allowed
     * to be serviced.  A request that still exceeds the limits is therefore
     * re-inserted at the head of the queue, and everything behind it stays
     * in throttled_reqs.
     */
    for (;;) {
        if (!bdrv_exceed_io_limits(bs, nb_sectors, is_write, &delay)) {
            break;
        }
        /* Re-arm the timer relative to the current vm_clock reading */
        qemu_mod_timer(bs->block_timer,
                       delay + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
196

    
197
/*
 * Check whether path starts with "<protocol>:", i.e. whether a ':' appears
 * before any path separator.  On Windows, drive specifications such as
 * "c:" are never treated as a protocol.
 *
 * Returns 1 if a protocol prefix is present, 0 otherwise.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* Index of the first ':' or separator, or of the terminating NUL */
    return path[strcspn(path, stop_chars)] == ':';
}
214

    
215
/*
 * Return 1 when path is absolute, 0 otherwise.  A path is absolute when it
 * starts with '/'; on Windows a leading '\\' or a drive specification also
 * counts.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
227

    
228
/*
 * Build the path of filename in dest (at most dest_size bytes including the
 * terminating NUL).
 *
 * If filename is absolute it is copied unchanged.  Otherwise it is taken
 * relative to base_path: everything in base_path up to and including the
 * later of its first ':' and its last path separator is kept, and filename
 * is appended to it.  URLs are supported.
 *
 * Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon;
    const char *after_sep;
    const char *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Position just past the first ':' (or the start if there is none) */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* Position just past the last separator (or the start if none) */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;

    /* Truncate the kept prefix so that it and the NUL fit into dest */
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';

    pstrcat(dest, dest_size, filename);
}
271

    
272
/*
 * Write the backing file name of bs into dest (sz bytes).  An empty name or
 * one carrying a "<protocol>:" prefix is copied as is; anything else is
 * combined with bs->filename through path_combine().
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;
    int copy_verbatim = backing[0] == '\0' || path_has_protocol(backing);

    if (!copy_verbatim) {
        path_combine(dest, sz, bs->filename, backing);
        return;
    }
    pstrcpy(dest, sz, backing);
}
280

    
281
/*
 * Add bdrv to the list of known block drivers, filling in emulation
 * callbacks for the interfaces it does not implement.
 */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (bdrv->bdrv_co_readv == NULL) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so
         * a driver that lacks aio needs that emulated as well.
         */
        if (bdrv->bdrv_aio_readv == NULL) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
300

    
301
/* create a new block device (by default it is empty) */
302
BlockDriverState *bdrv_new(const char *device_name)
303
{
304
    BlockDriverState *bs;
305

    
306
    bs = g_malloc0(sizeof(BlockDriverState));
307
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
308
    if (device_name[0] != '\0') {
309
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
310
    }
311
    bdrv_iostatus_disable(bs);
312
    notifier_list_init(&bs->close_notifiers);
313

    
314
    return bs;
315
}
316

    
317
/* Add notify to the close_notifiers list of bs. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    NotifierList *close_list = &bs->close_notifiers;

    notifier_list_add(close_list, notify);
}
321

    
322
BlockDriver *bdrv_find_format(const char *format_name)
323
{
324
    BlockDriver *drv1;
325
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
326
        if (!strcmp(drv1->format_name, format_name)) {
327
            return drv1;
328
        }
329
    }
330
    return NULL;
331
}
332

    
333
/*
 * Return 1 when drv may be used according to the build-time whitelist
 * (CONFIG_BDRV_WHITELIST), 0 otherwise.  A whitelist whose first entry is
 * NULL permits every driver.
 */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    int i;

    if (whitelist[0] == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    /* Walk the entries up to the first NULL */
    for (i = 0; whitelist[i] != NULL; i++) {
        if (strcmp(drv->format_name, whitelist[i]) == 0) {
            return 1;
        }
    }

    return 0;
}
350

    
351
/*
 * Like bdrv_find_format(), but returns NULL as well when the driver found
 * is not whitelisted.
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *found = bdrv_find_format(format_name);

    if (found == NULL || !bdrv_is_whitelisted(found)) {
        return NULL;
    }
    return found;
}
356

    
357
typedef struct CreateCo {
358
    BlockDriver *drv;
359
    char *filename;
360
    QEMUOptionParameter *options;
361
    int ret;
362
} CreateCo;
363

    
364
/*
 * Coroutine entry point used by bdrv_create(): opaque is a CreateCo whose
 * driver's bdrv_create callback is invoked; the result is stored in ->ret.
 */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *request = opaque;
    BlockDriver *drv = request->drv;

    assert(drv);

    request->ret = drv->bdrv_create(request->filename, request->options);
}
371

    
372
int bdrv_create(BlockDriver *drv, const char* filename,
373
    QEMUOptionParameter *options)
374
{
375
    int ret;
376

    
377
    Coroutine *co;
378
    CreateCo cco = {
379
        .drv = drv,
380
        .filename = g_strdup(filename),
381
        .options = options,
382
        .ret = NOT_DONE,
383
    };
384

    
385
    if (!drv->bdrv_create) {
386
        ret = -ENOTSUP;
387
        goto out;
388
    }
389

    
390
    if (qemu_in_coroutine()) {
391
        /* Fast-path if already in coroutine context */
392
        bdrv_create_co_entry(&cco);
393
    } else {
394
        co = qemu_coroutine_create(bdrv_create_co_entry);
395
        qemu_coroutine_enter(co, &cco);
396
        while (cco.ret == NOT_DONE) {
397
            qemu_aio_wait();
398
        }
399
    }
400

    
401
    ret = cco.ret;
402

    
403
out:
404
    g_free(cco.filename);
405
    return ret;
406
}
407

    
408
/*
 * Create filename with the protocol driver chosen by bdrv_find_protocol().
 *
 * Returns -ENOENT when no protocol driver is found, otherwise the result
 * of bdrv_create().
 */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *proto = bdrv_find_protocol(filename);

    return proto ? bdrv_create(proto, filename, options) : -ENOENT;
}
419

    
420
/*
 * Create a uniquely-named empty temporary file.
 *
 * filename receives the name of the new file and size is the capacity of
 * that buffer.  On Windows size must be at least MAX_PATH; elsewhere the
 * file is created in $TMPDIR, or in /tmp when TMPDIR is unset.
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW when
 * the generated name does not fit into the buffer).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* Capture the close() error first: unlink() may overwrite errno */
        int close_errno = errno;

        unlink(filename);
        return -close_errno;
    }
    return 0;
#endif
}
454

    
455
/*
456
 * Detect host devices. By convention, /dev/cdrom[N] is always
457
 * recognized as a host CDROM.
458
 */
459
static BlockDriver *find_hdev_driver(const char *filename)
460
{
461
    int score_max = 0, score;
462
    BlockDriver *drv = NULL, *d;
463

    
464
    QLIST_FOREACH(d, &bdrv_drivers, list) {
465
        if (d->bdrv_probe_device) {
466
            score = d->bdrv_probe_device(filename);
467
            if (score > score_max) {
468
                score_max = score;
469
                drv = d;
470
            }
471
        }
472
    }
473

    
474
    return drv;
475
}
476

    
477
BlockDriver *bdrv_find_protocol(const char *filename)
478
{
479
    BlockDriver *drv1;
480
    char protocol[128];
481
    int len;
482
    const char *p;
483

    
484
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
485

    
486
    /*
487
     * XXX(hch): we really should not let host device detection
488
     * override an explicit protocol specification, but moving this
489
     * later breaks access to device names with colons in them.
490
     * Thanks to the brain-dead persistent naming schemes on udev-
491
     * based Linux systems those actually are quite common.
492
     */
493
    drv1 = find_hdev_driver(filename);
494
    if (drv1) {
495
        return drv1;
496
    }
497

    
498
    if (!path_has_protocol(filename)) {
499
        return bdrv_find_format("file");
500
    }
501
    p = strchr(filename, ':');
502
    assert(p != NULL);
503
    len = p - filename;
504
    if (len > sizeof(protocol) - 1)
505
        len = sizeof(protocol) - 1;
506
    memcpy(protocol, filename, len);
507
    protocol[len] = '\0';
508
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
509
        if (drv1->protocol_name &&
510
            !strcmp(drv1->protocol_name, protocol)) {
511
            return drv1;
512
        }
513
    }
514
    return NULL;
515
}
516

    
517
static int find_image_format(BlockDriverState *bs, const char *filename,
518
                             BlockDriver **pdrv)
519
{
520
    int score, score_max;
521
    BlockDriver *drv1, *drv;
522
    uint8_t buf[2048];
523
    int ret = 0;
524

    
525
    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
526
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
527
        drv = bdrv_find_format("raw");
528
        if (!drv) {
529
            ret = -ENOENT;
530
        }
531
        *pdrv = drv;
532
        return ret;
533
    }
534

    
535
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
536
    if (ret < 0) {
537
        *pdrv = NULL;
538
        return ret;
539
    }
540

    
541
    score_max = 0;
542
    drv = NULL;
543
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
544
        if (drv1->bdrv_probe) {
545
            score = drv1->bdrv_probe(buf, ret, filename);
546
            if (score > score_max) {
547
                score_max = score;
548
                drv = drv1;
549
            }
550
        }
551
    }
552
    if (!drv) {
553
        ret = -ENOENT;
554
    }
555
    *pdrv = drv;
556
    return ret;
557
}
558

    
559
/**
560
 * Set the current 'total_sectors' value
561
 */
562
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
563
{
564
    BlockDriver *drv = bs->drv;
565

    
566
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
567
    if (bs->sg)
568
        return 0;
569

    
570
    /* query actual device if possible, otherwise just trust the hint */
571
    if (drv->bdrv_getlength) {
572
        int64_t length = drv->bdrv_getlength(bs);
573
        if (length < 0) {
574
            return length;
575
        }
576
        hint = length >> BDRV_SECTOR_BITS;
577
    }
578

    
579
    bs->total_sectors = hint;
580
    return 0;
581
}
582

    
583
/**
584
 * Set open flags for a given discard mode
585
 *
586
 * Return 0 on success, -1 if the discard mode was invalid.
587
 */
588
int bdrv_parse_discard_flags(const char *mode, int *flags)
589
{
590
    *flags &= ~BDRV_O_UNMAP;
591

    
592
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
593
        /* do nothing */
594
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
595
        *flags |= BDRV_O_UNMAP;
596
    } else {
597
        return -1;
598
    }
599

    
600
    return 0;
601
}
602

    
603
/**
604
 * Set open flags for a given cache mode
605
 *
606
 * Return 0 on success, -1 if the cache mode was invalid.
607
 */
608
int bdrv_parse_cache_flags(const char *mode, int *flags)
609
{
610
    *flags &= ~BDRV_O_CACHE_MASK;
611

    
612
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
613
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
614
    } else if (!strcmp(mode, "directsync")) {
615
        *flags |= BDRV_O_NOCACHE;
616
    } else if (!strcmp(mode, "writeback")) {
617
        *flags |= BDRV_O_CACHE_WB;
618
    } else if (!strcmp(mode, "unsafe")) {
619
        *flags |= BDRV_O_CACHE_WB;
620
        *flags |= BDRV_O_NO_FLUSH;
621
    } else if (!strcmp(mode, "writethrough")) {
622
        /* this is the default */
623
    } else {
624
        return -1;
625
    }
626

    
627
    return 0;
628
}
629

    
630
/**
631
 * The copy-on-read flag is actually a reference count so multiple users may
632
 * use the feature without worrying about clobbering its previous state.
633
 * Copy-on-read stays enabled until all users have called to disable it.
634
 */
635
void bdrv_enable_copy_on_read(BlockDriverState *bs)
636
{
637
    bs->copy_on_read++;
638
}
639

    
640
/*
 * Drop one copy-on-read reference taken by bdrv_enable_copy_on_read().
 * The count must be positive when this is called.
 */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
645

    
646
/*
 * Derive the flags handed to a driver's open function from flags:
 * BDRV_O_CACHE_WB is added, the flags that are internal to the block layer
 * (BDRV_O_SNAPSHOT, BDRV_O_NO_BACKING) are cleared, and BDRV_O_RDWR is
 * forced for temporary images because snapshots should be writable.
 */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    const int internal_flags = BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING;
    int open_flags = (flags | BDRV_O_CACHE_WB) & ~internal_flags;

    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
665

    
666
/*
667
 * Common part for opening disk images and files
668
 *
669
 * Removes all processed options from *options.
670
 */
671
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
672
    const char *filename, QDict *options,
673
    int flags, BlockDriver *drv)
674
{
675
    int ret, open_flags;
676

    
677
    assert(drv != NULL);
678
    assert(bs->file == NULL);
679
    assert(options == NULL || bs->options != options);
680

    
681
    trace_bdrv_open_common(bs, filename, flags, drv->format_name);
682

    
683
    bs->open_flags = flags;
684
    bs->buffer_alignment = 512;
685

    
686
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
687
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
688
        bdrv_enable_copy_on_read(bs);
689
    }
690

    
691
    pstrcpy(bs->filename, sizeof(bs->filename), filename);
692

    
693
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
694
        return -ENOTSUP;
695
    }
696

    
697
    bs->drv = drv;
698
    bs->opaque = g_malloc0(drv->instance_size);
699

    
700
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
701
    open_flags = bdrv_open_flags(bs, flags);
702

    
703
    bs->read_only = !(open_flags & BDRV_O_RDWR);
704

    
705
    /* Open the image, either directly or using a protocol */
706
    if (drv->bdrv_file_open) {
707
        if (file != NULL) {
708
            bdrv_swap(file, bs);
709
            ret = 0;
710
        } else {
711
            ret = drv->bdrv_file_open(bs, filename, open_flags);
712
        }
713
    } else {
714
        assert(file != NULL);
715
        bs->file = file;
716
        ret = drv->bdrv_open(bs, options, open_flags);
717
    }
718

    
719
    if (ret < 0) {
720
        goto free_and_fail;
721
    }
722

    
723
    ret = refresh_total_sectors(bs, bs->total_sectors);
724
    if (ret < 0) {
725
        goto free_and_fail;
726
    }
727

    
728
#ifndef _WIN32
729
    if (bs->is_temporary) {
730
        unlink(filename);
731
    }
732
#endif
733
    return 0;
734

    
735
free_and_fail:
736
    bs->file = NULL;
737
    g_free(bs->opaque);
738
    bs->opaque = NULL;
739
    bs->drv = NULL;
740
    return ret;
741
}
742

    
743
/*
744
 * Opens a file using a protocol (file, host_device, nbd, ...)
745
 */
746
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
747
{
748
    BlockDriverState *bs;
749
    BlockDriver *drv;
750
    int ret;
751

    
752
    drv = bdrv_find_protocol(filename);
753
    if (!drv) {
754
        return -ENOENT;
755
    }
756

    
757
    bs = bdrv_new("");
758
    ret = bdrv_open_common(bs, NULL, filename, NULL, flags, drv);
759
    if (ret < 0) {
760
        bdrv_delete(bs);
761
        return ret;
762
    }
763
    bs->growable = 1;
764
    *pbs = bs;
765
    return 0;
766
}
767

    
768
/*
 * Open the backing file named in bs->backing_file and attach it as
 * bs->backing_hd.
 *
 * Nothing is done when a backing file is attached already or when bs has
 * no backing file name.  BDRV_O_NO_BACKING is cleared from bs->open_flags
 * and set again if opening fails.
 *
 * Returns 0 on success (including the nothing-to-do cases) or the negative
 * error from bdrv_open().
 */
int bdrv_open_backing_file(BlockDriverState *bs)
{
    char backing_filename[PATH_MAX];
    BlockDriver *back_drv = NULL;
    int back_flags;
    int ret;

    if (bs->backing_hd != NULL) {
        return 0;
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (bs->backing_file[0] == '\0') {
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    /* Without an explicit backing format back_drv stays NULL */
    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd, backing_filename, NULL,
                    back_flags, back_drv);
    if (ret >= 0) {
        return 0;
    }

    bdrv_delete(bs->backing_hd);
    bs->backing_hd = NULL;
    bs->open_flags |= BDRV_O_NO_BACKING;
    return ret;
}
804

    
805
/*
806
 * Opens a disk image (raw, qcow2, vmdk, ...)
807
 *
808
 * options is a QDict of options to pass to the block drivers, or NULL for an
809
 * empty set of options. The reference to the QDict belongs to the block layer
810
 * after the call (even on failure), so if the caller intends to reuse the
811
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
812
 */
813
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
814
              int flags, BlockDriver *drv)
815
{
816
    int ret;
817
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
818
    char tmp_filename[PATH_MAX + 1];
819
    BlockDriverState *file = NULL;
820

    
821
    /* NULL means an empty set of options */
822
    if (options == NULL) {
823
        options = qdict_new();
824
    }
825

    
826
    bs->options = options;
827
    options = qdict_clone_shallow(options);
828

    
829
    /* For snapshot=on, create a temporary qcow2 overlay */
830
    if (flags & BDRV_O_SNAPSHOT) {
831
        BlockDriverState *bs1;
832
        int64_t total_size;
833
        int is_protocol = 0;
834
        BlockDriver *bdrv_qcow2;
835
        QEMUOptionParameter *options;
836
        char backing_filename[PATH_MAX];
837

    
838
        /* if snapshot, we create a temporary backing file and open it
839
           instead of opening 'filename' directly */
840

    
841
        /* if there is a backing file, use it */
842
        bs1 = bdrv_new("");
843
        ret = bdrv_open(bs1, filename, NULL, 0, drv);
844
        if (ret < 0) {
845
            bdrv_delete(bs1);
846
            goto fail;
847
        }
848
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
849

    
850
        if (bs1->drv && bs1->drv->protocol_name)
851
            is_protocol = 1;
852

    
853
        bdrv_delete(bs1);
854

    
855
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
856
        if (ret < 0) {
857
            goto fail;
858
        }
859

    
860
        /* Real path is meaningless for protocols */
861
        if (is_protocol) {
862
            snprintf(backing_filename, sizeof(backing_filename),
863
                     "%s", filename);
864
        } else if (!realpath(filename, backing_filename)) {
865
            ret = -errno;
866
            goto fail;
867
        }
868

    
869
        bdrv_qcow2 = bdrv_find_format("qcow2");
870
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
871

    
872
        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
873
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
874
        if (drv) {
875
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
876
                drv->format_name);
877
        }
878

    
879
        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
880
        free_option_parameters(options);
881
        if (ret < 0) {
882
            goto fail;
883
        }
884

    
885
        filename = tmp_filename;
886
        drv = bdrv_qcow2;
887
        bs->is_temporary = 1;
888
    }
889

    
890
    /* Open image file without format layer */
891
    if (flags & BDRV_O_RDWR) {
892
        flags |= BDRV_O_ALLOW_RDWR;
893
    }
894

    
895
    ret = bdrv_file_open(&file, filename, bdrv_open_flags(bs, flags));
896
    if (ret < 0) {
897
        goto fail;
898
    }
899

    
900
    /* Find the right image format driver */
901
    if (!drv) {
902
        ret = find_image_format(file, filename, &drv);
903
    }
904

    
905
    if (!drv) {
906
        goto unlink_and_fail;
907
    }
908

    
909
    /* Open the image */
910
    ret = bdrv_open_common(bs, file, filename, options, flags, drv);
911
    if (ret < 0) {
912
        goto unlink_and_fail;
913
    }
914

    
915
    if (bs->file != file) {
916
        bdrv_delete(file);
917
        file = NULL;
918
    }
919

    
920
    /* If there is a backing file, use it */
921
    if ((flags & BDRV_O_NO_BACKING) == 0) {
922
        ret = bdrv_open_backing_file(bs);
923
        if (ret < 0) {
924
            goto close_and_fail;
925
        }
926
    }
927

    
928
    /* Check if any unknown options were used */
929
    if (qdict_size(options) != 0) {
930
        const QDictEntry *entry = qdict_first(options);
931
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
932
            "device '%s' doesn't support the option '%s'",
933
            drv->format_name, bs->device_name, entry->key);
934

    
935
        ret = -EINVAL;
936
        goto close_and_fail;
937
    }
938
    QDECREF(options);
939

    
940
    if (!bdrv_key_required(bs)) {
941
        bdrv_dev_change_media_cb(bs, true);
942
    }
943

    
944
    /* throttling disk I/O limits */
945
    if (bs->io_limits_enabled) {
946
        bdrv_io_limits_enable(bs);
947
    }
948

    
949
    return 0;
950

    
951
unlink_and_fail:
952
    if (file != NULL) {
953
        bdrv_delete(file);
954
    }
955
    if (bs->is_temporary) {
956
        unlink(filename);
957
    }
958
fail:
959
    QDECREF(bs->options);
960
    QDECREF(options);
961
    bs->options = NULL;
962
    return ret;
963

    
964
close_and_fail:
965
    bdrv_close(bs);
966
    QDECREF(options);
967
    return ret;
968
}
969

    
970
typedef struct BlockReopenQueueEntry {
971
     bool prepared;
972
     BDRVReopenState state;
973
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
974
} BlockReopenQueueEntry;
975

    
976
/*
977
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
978
 * reopen of multiple devices.
979
 *
980
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
981
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
982
 * be created and initialized. This newly created BlockReopenQueue should be
983
 * passed back in for subsequent calls that are intended to be of the same
984
 * atomic 'set'.
985
 *
986
 * bs is the BlockDriverState to add to the reopen queue.
987
 *
988
 * flags contains the open flags for the associated bs
989
 *
990
 * returns a pointer to bs_queue, which is either the newly allocated
991
 * bs_queue, or the existing bs_queue being used.
992
 *
993
 */
994
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
995
                                    BlockDriverState *bs, int flags)
996
{
997
    assert(bs != NULL);
998

    
999
    BlockReopenQueueEntry *bs_entry;
1000
    if (bs_queue == NULL) {
1001
        bs_queue = g_new0(BlockReopenQueue, 1);
1002
        QSIMPLEQ_INIT(bs_queue);
1003
    }
1004

    
1005
    if (bs->file) {
1006
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1007
    }
1008

    
1009
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1010
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1011

    
1012
    bs_entry->state.bs = bs;
1013
    bs_entry->state.flags = flags;
1014

    
1015
    return bs_queue;
1016
}
1017

    
1018
/*
1019
 * Reopen multiple BlockDriverStates atomically & transactionally.
1020
 *
1021
 * The queue passed in (bs_queue) must have been built up previous
1022
 * via bdrv_reopen_queue().
1023
 *
1024
 * Reopens all BDS specified in the queue, with the appropriate
1025
 * flags.  All devices are prepared for reopen, and failure of any
1026
 * device will cause all device changes to be abandonded, and intermediate
1027
 * data cleaned up.
1028
 *
1029
 * If all devices prepare successfully, then the changes are committed
1030
 * to all devices.
1031
 *
1032
 */
1033
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1034
{
1035
    int ret = -1;
1036
    BlockReopenQueueEntry *bs_entry, *next;
1037
    Error *local_err = NULL;
1038

    
1039
    assert(bs_queue != NULL);
1040

    
1041
    bdrv_drain_all();
1042

    
1043
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1044
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1045
            error_propagate(errp, local_err);
1046
            goto cleanup;
1047
        }
1048
        bs_entry->prepared = true;
1049
    }
1050

    
1051
    /* If we reach this point, we have success and just need to apply the
1052
     * changes
1053
     */
1054
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1055
        bdrv_reopen_commit(&bs_entry->state);
1056
    }
1057

    
1058
    ret = 0;
1059

    
1060
cleanup:
1061
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1062
        if (ret && bs_entry->prepared) {
1063
            bdrv_reopen_abort(&bs_entry->state);
1064
        }
1065
        g_free(bs_entry);
1066
    }
1067
    g_free(bs_queue);
1068
    return ret;
1069
}
1070

    
1071

    
1072
/* Reopen a single BlockDriverState with the specified flags. */
1073
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1074
{
1075
    int ret = -1;
1076
    Error *local_err = NULL;
1077
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1078

    
1079
    ret = bdrv_reopen_multiple(queue, &local_err);
1080
    if (local_err != NULL) {
1081
        error_propagate(errp, local_err);
1082
    }
1083
    return ret;
1084
}
1085

    
1086

    
1087
/*
1088
 * Prepares a BlockDriverState for reopen. All changes are staged in the
1089
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1090
 * the block driver layer .bdrv_reopen_prepare()
1091
 *
1092
 * bs is the BlockDriverState to reopen
1093
 * flags are the new open flags
1094
 * queue is the reopen queue
1095
 *
1096
 * Returns 0 on success, non-zero on error.  On error errp will be set
1097
 * as well.
1098
 *
1099
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1100
 * It is the responsibility of the caller to then call the abort() or
1101
 * commit() for any other BDS that have been left in a prepare() state
1102
 *
1103
 */
1104
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1105
                        Error **errp)
1106
{
1107
    int ret = -1;
1108
    Error *local_err = NULL;
1109
    BlockDriver *drv;
1110

    
1111
    assert(reopen_state != NULL);
1112
    assert(reopen_state->bs->drv != NULL);
1113
    drv = reopen_state->bs->drv;
1114

    
1115
    /* if we are to stay read-only, do not allow permission change
1116
     * to r/w */
1117
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1118
        reopen_state->flags & BDRV_O_RDWR) {
1119
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1120
                  reopen_state->bs->device_name);
1121
        goto error;
1122
    }
1123

    
1124

    
1125
    ret = bdrv_flush(reopen_state->bs);
1126
    if (ret) {
1127
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1128
                  strerror(-ret));
1129
        goto error;
1130
    }
1131

    
1132
    if (drv->bdrv_reopen_prepare) {
1133
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1134
        if (ret) {
1135
            if (local_err != NULL) {
1136
                error_propagate(errp, local_err);
1137
            } else {
1138
                error_set(errp, QERR_OPEN_FILE_FAILED,
1139
                          reopen_state->bs->filename);
1140
            }
1141
            goto error;
1142
        }
1143
    } else {
1144
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1145
         * handler for each supported drv. */
1146
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1147
                  drv->format_name, reopen_state->bs->device_name,
1148
                 "reopening of file");
1149
        ret = -1;
1150
        goto error;
1151
    }
1152

    
1153
    ret = 0;
1154

    
1155
error:
1156
    return ret;
1157
}
1158

    
1159
/*
1160
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1161
 * makes them final by swapping the staging BlockDriverState contents into
1162
 * the active BlockDriverState contents.
1163
 */
1164
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1165
{
1166
    BlockDriver *drv;
1167

    
1168
    assert(reopen_state != NULL);
1169
    drv = reopen_state->bs->drv;
1170
    assert(drv != NULL);
1171

    
1172
    /* If there are any driver level actions to take */
1173
    if (drv->bdrv_reopen_commit) {
1174
        drv->bdrv_reopen_commit(reopen_state);
1175
    }
1176

    
1177
    /* set BDS specific flags now */
1178
    reopen_state->bs->open_flags         = reopen_state->flags;
1179
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1180
                                              BDRV_O_CACHE_WB);
1181
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1182
}
1183

    
1184
/*
1185
 * Abort the reopen, and delete and free the staged changes in
1186
 * reopen_state
1187
 */
1188
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1189
{
1190
    BlockDriver *drv;
1191

    
1192
    assert(reopen_state != NULL);
1193
    drv = reopen_state->bs->drv;
1194
    assert(drv != NULL);
1195

    
1196
    if (drv->bdrv_reopen_abort) {
1197
        drv->bdrv_reopen_abort(reopen_state);
1198
    }
1199
}
1200

    
1201

    
1202
/*
 * Close a BlockDriverState.
 *
 * Flushes bs, synchronously cancels its block job (if any), drains all
 * pending requests and notifies the close notifiers.  If a driver is
 * attached, the backing file and bs->file are deleted, the driver's
 * bdrv_close() is called and the per-image state is reset.  Finally the
 * attached device is told that the medium is gone and I/O throttling is
 * disabled if it was enabled.
 */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs_snapshots == bs) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }

        /* let the driver tear down, then drop its private state */
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif

        /* reset everything that described the image just closed */
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1252

    
1253
void bdrv_close_all(void)
1254
{
1255
    BlockDriverState *bs;
1256

    
1257
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1258
        bdrv_close(bs);
1259
    }
1260
}
1261

    
1262
/*
1263
 * Wait for pending requests to complete across all BlockDriverStates
1264
 *
1265
 * This function does not flush data to disk, use bdrv_flush_all() for that
1266
 * after calling this function.
1267
 *
1268
 * Note that completion of an asynchronous I/O operation can trigger any
1269
 * number of other I/O operations on other devices---for example a coroutine
1270
 * can be arbitrarily complex and a constant flow of I/O can come until the
1271
 * coroutine is complete.  Because of this, it is not possible to have a
1272
 * function to drain a single device's I/O queue.
1273
 */
1274
void bdrv_drain_all(void)
1275
{
1276
    BlockDriverState *bs;
1277
    bool busy;
1278

    
1279
    do {
1280
        busy = qemu_aio_wait();
1281

    
1282
        /* FIXME: We do not have timer support here, so this is effectively
1283
         * a busy wait.
1284
         */
1285
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
1286
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
1287
                qemu_co_queue_restart_all(&bs->throttled_reqs);
1288
                busy = true;
1289
            }
1290
        }
1291
    } while (busy);
1292

    
1293
    /* If requests are still pending there is a bug somewhere */
1294
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1295
        assert(QLIST_EMPTY(&bs->tracked_requests));
1296
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
1297
    }
1298
}
1299

    
1300
/* make a BlockDriverState anonymous by removing from bdrv_state list.
1301
   Also, NULL terminate the device_name to prevent double remove */
1302
void bdrv_make_anon(BlockDriverState *bs)
1303
{
1304
    if (bs->device_name[0] != '\0') {
1305
        QTAILQ_REMOVE(&bdrv_states, bs, list);
1306
    }
1307
    bs->device_name[0] = '\0';
1308
}
1309

    
1310
/* Invoke the driver's optional bdrv_rebind() hook for bs, if there is one. */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (!bs->drv || !bs->drv->bdrv_rebind) {
        return;
    }
    bs->drv->bdrv_rebind(bs);
}
1316

    
1317
/*
 * Copy from bs_src to bs_dest the fields that need to stay attached to
 * the device rather than to the image: open flags, device binding,
 * cache mode, I/O throttling state, error policy, I/O status, dirty
 * bitmap, job state and the bdrv_states list entry (name and links).
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    bs_dest->open_flags = bs_src->open_flags;

    /* device binding */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* I/O throttling state and timing parameters */
    bs_dest->slice_time = bs_src->slice_time;
    bs_dest->slice_start = bs_src->slice_start;
    bs_dest->slice_end = bs_src->slice_end;
    bs_dest->io_limits = bs_src->io_limits;
    bs_dest->io_base = bs_src->io_base;
    bs_dest->throttled_reqs = bs_src->throttled_reqs;
    bs_dest->block_timer = bs_src->block_timer;
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap = bs_src->dirty_bitmap;

    /* block job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}
1362

    
1363
/*
1364
 * Swap bs contents for two image chains while they are live,
1365
 * while keeping required fields on the BlockDriverState that is
1366
 * actually attached to a device.
1367
 *
1368
 * This will modify the BlockDriverState fields, and swap contents
1369
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1370
 *
1371
 * bs_new is required to be anonymous.
1372
 *
1373
 * This function does not create any image files.
1374
 */
1375
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1376
{
1377
    BlockDriverState tmp;
1378

    
1379
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1380
    assert(bs_new->device_name[0] == '\0');
1381
    assert(bs_new->dirty_bitmap == NULL);
1382
    assert(bs_new->job == NULL);
1383
    assert(bs_new->dev == NULL);
1384
    assert(bs_new->in_use == 0);
1385
    assert(bs_new->io_limits_enabled == false);
1386
    assert(bs_new->block_timer == NULL);
1387

    
1388
    tmp = *bs_new;
1389
    *bs_new = *bs_old;
1390
    *bs_old = tmp;
1391

    
1392
    /* there are some fields that should not be swapped, move them back */
1393
    bdrv_move_feature_fields(&tmp, bs_old);
1394
    bdrv_move_feature_fields(bs_old, bs_new);
1395
    bdrv_move_feature_fields(bs_new, &tmp);
1396

    
1397
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1398
    assert(bs_new->device_name[0] == '\0');
1399

    
1400
    /* Check a few fields that should remain attached to the device */
1401
    assert(bs_new->dev == NULL);
1402
    assert(bs_new->job == NULL);
1403
    assert(bs_new->in_use == 0);
1404
    assert(bs_new->io_limits_enabled == false);
1405
    assert(bs_new->block_timer == NULL);
1406

    
1407
    bdrv_rebind(bs_new);
1408
    bdrv_rebind(bs_old);
1409
}
1410

    
1411
/*
1412
 * Add new bs contents at the top of an image chain while the chain is
1413
 * live, while keeping required fields on the top layer.
1414
 *
1415
 * This will modify the BlockDriverState fields, and swap contents
1416
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1417
 *
1418
 * bs_new is required to be anonymous.
1419
 *
1420
 * This function does not create any image files.
1421
 */
1422
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1423
{
1424
    bdrv_swap(bs_new, bs_top);
1425

    
1426
    /* The contents of 'tmp' will become bs_top, as we are
1427
     * swapping bs_new and bs_top contents. */
1428
    bs_top->backing_hd = bs_new;
1429
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1430
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1431
            bs_new->filename);
1432
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1433
            bs_new->drv ? bs_new->drv->format_name : "");
1434
}
1435

    
1436
/*
 * Close and free a BlockDriverState.  The bs must not have an attached
 * device, a block job, or be marked in use.
 */
void bdrv_delete(BlockDriverState *bs)
{
    assert(bs->dev == NULL);
    assert(bs->job == NULL);
    assert(bs->in_use == 0);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs_snapshots != bs);
    g_free(bs);
}
1450

    
1451
/*
 * Attach dev to bs and reset the I/O status.
 * Returns 0 on success, -EBUSY if bs already has a device attached.
 *
 * TODO change to DeviceState *dev when all users are qdevified
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
{
    if (!bs->dev) {
        bs->dev = dev;
        bdrv_iostatus_reset(bs);
        return 0;
    }
    return -EBUSY;
}
1461

    
1462
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1463
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1464
{
1465
    if (bdrv_attach_dev(bs, dev) < 0) {
1466
        abort();
1467
    }
1468
}
1469

    
1470
/*
 * Detach dev from bs.  dev must be the device currently attached.  The
 * device ops and their opaque pointer are cleared and the buffer
 * alignment is set back to 512.
 *
 * TODO change to DeviceState *dev when all users are qdevified
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
{
    assert(bs->dev == dev);

    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->dev = NULL;
    bs->buffer_alignment = 512;
}
1479

    
1480
/* TODO change to return DeviceState * when all users are qdevified */
1481
void *bdrv_get_attached_dev(BlockDriverState *bs)
1482
{
1483
    return bs->dev;
1484
}
1485

    
1486
/*
 * Install the device callbacks 'ops' and their 'opaque' argument on bs.
 * If bs then counts as having removable media and is the current
 * bs_snapshots, bs_snapshots is reset to NULL.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;

    if (!bdrv_dev_has_removable_media(bs)) {
        return;
    }
    if (bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}
1495

    
1496
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1497
                               enum MonitorEvent ev,
1498
                               BlockErrorAction action, bool is_read)
1499
{
1500
    QObject *data;
1501
    const char *action_str;
1502

    
1503
    switch (action) {
1504
    case BDRV_ACTION_REPORT:
1505
        action_str = "report";
1506
        break;
1507
    case BDRV_ACTION_IGNORE:
1508
        action_str = "ignore";
1509
        break;
1510
    case BDRV_ACTION_STOP:
1511
        action_str = "stop";
1512
        break;
1513
    default:
1514
        abort();
1515
    }
1516

    
1517
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1518
                              bdrv->device_name,
1519
                              action_str,
1520
                              is_read ? "read" : "write");
1521
    monitor_protocol_event(ev, data);
1522

    
1523
    qobject_decref(data);
1524
}
1525

    
1526
/*
 * Emit a QEVENT_DEVICE_TRAY_MOVED monitor event for bs; 'ejected' is
 * reported as the event's 'tray-open' value.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *event_data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                             bdrv_get_device_name(bs),
                                             ejected);

    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event_data);
    qobject_decref(event_data);
}
1536

    
1537
/*
 * Tell the attached device about a media change, if it registered a
 * change_media_cb.  A tray-open event is emitted when the tray was
 * closed before the callback ran; a tray-close event is emitted when
 * 'load' is true.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool tray_was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* sample the tray state before the device gets to change it */
    tray_was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (tray_was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1552

    
1553
/*
 * Returns true if bs has no device attached, or if the attached device
 * registered a change_media_cb.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
1557

    
1558
/* Forward an eject request to the device's eject_request_cb, if any. */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (!bs->dev_ops || !bs->dev_ops->eject_request_cb) {
        return;
    }
    bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
}
1564

    
1565
/*
 * Ask the device whether its tray is open.  Returns false when the
 * device provides no is_tray_open callback.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_tray_open) {
        return false;
    }
    return bs->dev_ops->is_tray_open(bs->dev_opaque);
}
1572

    
1573
/* Invoke the device's resize_cb, if it registered one. */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->resize_cb) {
        return;
    }
    bs->dev_ops->resize_cb(bs->dev_opaque);
}
1579

    
1580
/*
 * Ask the device whether its medium is locked.  Returns false when the
 * device provides no is_medium_locked callback.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_medium_locked) {
        return false;
    }
    return bs->dev_ops->is_medium_locked(bs->dev_opaque);
}
1587

    
1588
/*
1589
 * Run consistency checks on an image
1590
 *
1591
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1592
 * free of errors) or -errno when an internal error occurred. The results of the
1593
 * check are stored in res.
1594
 */
1595
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1596
{
1597
    if (bs->drv->bdrv_check == NULL) {
1598
        return -ENOTSUP;
1599
    }
1600

    
1601
    memset(res, 0, sizeof(*res));
1602
    return bs->drv->bdrv_check(bs, res, fix);
1603
}
1604

    
1605
#define COMMIT_BUF_SECTORS 2048
1606

    
1607
/* commit COW file into the raw image */
1608
int bdrv_commit(BlockDriverState *bs)
1609
{
1610
    BlockDriver *drv = bs->drv;
1611
    int64_t sector, total_sectors;
1612
    int n, ro, open_flags;
1613
    int ret = 0;
1614
    uint8_t *buf;
1615
    char filename[PATH_MAX];
1616

    
1617
    if (!drv)
1618
        return -ENOMEDIUM;
1619
    
1620
    if (!bs->backing_hd) {
1621
        return -ENOTSUP;
1622
    }
1623

    
1624
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1625
        return -EBUSY;
1626
    }
1627

    
1628
    ro = bs->backing_hd->read_only;
1629
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1630
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1631
    open_flags =  bs->backing_hd->open_flags;
1632

    
1633
    if (ro) {
1634
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1635
            return -EACCES;
1636
        }
1637
    }
1638

    
1639
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1640
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1641

    
1642
    for (sector = 0; sector < total_sectors; sector += n) {
1643
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1644

    
1645
            if (bdrv_read(bs, sector, buf, n) != 0) {
1646
                ret = -EIO;
1647
                goto ro_cleanup;
1648
            }
1649

    
1650
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1651
                ret = -EIO;
1652
                goto ro_cleanup;
1653
            }
1654
        }
1655
    }
1656

    
1657
    if (drv->bdrv_make_empty) {
1658
        ret = drv->bdrv_make_empty(bs);
1659
        bdrv_flush(bs);
1660
    }
1661

    
1662
    /*
1663
     * Make sure all data we wrote to the backing device is actually
1664
     * stable on disk.
1665
     */
1666
    if (bs->backing_hd)
1667
        bdrv_flush(bs->backing_hd);
1668

    
1669
ro_cleanup:
1670
    g_free(buf);
1671

    
1672
    if (ro) {
1673
        /* ignoring error return here */
1674
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1675
    }
1676

    
1677
    return ret;
1678
}
1679

    
1680
int bdrv_commit_all(void)
1681
{
1682
    BlockDriverState *bs;
1683

    
1684
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1685
        if (bs->drv && bs->backing_hd) {
1686
            int ret = bdrv_commit(bs);
1687
            if (ret < 0) {
1688
                return ret;
1689
            }
1690
        }
1691
    }
1692
    return 0;
1693
}
1694

    
1695
struct BdrvTrackedRequest {
1696
    BlockDriverState *bs;
1697
    int64_t sector_num;
1698
    int nb_sectors;
1699
    bool is_write;
1700
    QLIST_ENTRY(BdrvTrackedRequest) list;
1701
    Coroutine *co; /* owner, used for deadlock detection */
1702
    CoQueue wait_queue; /* coroutines blocked on this request */
1703
};
1704

    
1705
/**
1706
 * Remove an active request from the tracked requests list
1707
 *
1708
 * This function should be called when a tracked request is completing.
1709
 */
1710
static void tracked_request_end(BdrvTrackedRequest *req)
1711
{
1712
    QLIST_REMOVE(req, list);
1713
    qemu_co_queue_restart_all(&req->wait_queue);
1714
}
1715

    
1716
/**
1717
 * Add an active request to the tracked requests list
1718
 */
1719
static void tracked_request_begin(BdrvTrackedRequest *req,
1720
                                  BlockDriverState *bs,
1721
                                  int64_t sector_num,
1722
                                  int nb_sectors, bool is_write)
1723
{
1724
    *req = (BdrvTrackedRequest){
1725
        .bs = bs,
1726
        .sector_num = sector_num,
1727
        .nb_sectors = nb_sectors,
1728
        .is_write = is_write,
1729
        .co = qemu_coroutine_self(),
1730
    };
1731

    
1732
    qemu_co_queue_init(&req->wait_queue);
1733

    
1734
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1735
}
1736

    
1737
/**
1738
 * Round a region to cluster boundaries
1739
 */
1740
void bdrv_round_to_clusters(BlockDriverState *bs,
1741
                            int64_t sector_num, int nb_sectors,
1742
                            int64_t *cluster_sector_num,
1743
                            int *cluster_nb_sectors)
1744
{
1745
    BlockDriverInfo bdi;
1746

    
1747
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1748
        *cluster_sector_num = sector_num;
1749
        *cluster_nb_sectors = nb_sectors;
1750
    } else {
1751
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1752
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1753
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1754
                                            nb_sectors, c);
1755
    }
1756
}
1757

    
1758
/*
 * Returns true if the sector range [sector_num, sector_num + nb_sectors)
 * intersects the range covered by req.
 */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    /* the two ranges overlap iff each one starts before the other ends */
    return sector_num < req->sector_num + req->nb_sectors &&
           req->sector_num < sector_num + nb_sectors;
}
1770

    
1771
/*
 * Block the calling coroutine until no tracked request on bs overlaps
 * the given region, after rounding the region to cluster boundaries.
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    BdrvTrackedRequest *conflict;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    for (;;) {
        /* find the first tracked request that overlaps, if any */
        conflict = NULL;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                conflict = req;
                break;
            }
        }
        if (!conflict) {
            break;
        }

        /* Hitting this means there was a reentrant request, for
         * example, a block driver issuing nested requests.  This must
         * never happen since it means deadlock.
         */
        assert(qemu_coroutine_self() != conflict->co);

        /* the list may have changed while we slept, so rescan it */
        qemu_co_queue_wait(&conflict->wait_queue);
    }
}
1806

    
1807
/*
1808
 * Return values:
1809
 * 0        - success
1810
 * -EINVAL  - backing format specified, but no file
1811
 * -ENOSPC  - can't update the backing file because no space is left in the
1812
 *            image file header
1813
 * -ENOTSUP - format driver doesn't support changing the backing file
1814
 */
1815
int bdrv_change_backing_file(BlockDriverState *bs,
1816
    const char *backing_file, const char *backing_fmt)
1817
{
1818
    BlockDriver *drv = bs->drv;
1819
    int ret;
1820

    
1821
    /* Backing file format doesn't make sense without a backing file */
1822
    if (backing_fmt && !backing_file) {
1823
        return -EINVAL;
1824
    }
1825

    
1826
    if (drv->bdrv_change_backing_file != NULL) {
1827
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1828
    } else {
1829
        ret = -ENOTSUP;
1830
    }
1831

    
1832
    if (ret == 0) {
1833
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1834
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1835
    }
1836
    return ret;
1837
}
1838

    
1839
/*
1840
 * Finds the image layer in the chain that has 'bs' as its backing file.
1841
 *
1842
 * active is the current topmost image.
1843
 *
1844
 * Returns NULL if bs is not found in active's image chain,
1845
 * or if active == bs.
1846
 */
1847
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
1848
                                    BlockDriverState *bs)
1849
{
1850
    BlockDriverState *overlay = NULL;
1851
    BlockDriverState *intermediate;
1852

    
1853
    assert(active != NULL);
1854
    assert(bs != NULL);
1855

    
1856
    /* if bs is the same as active, then by definition it has no overlay
1857
     */
1858
    if (active == bs) {
1859
        return NULL;
1860
    }
1861

    
1862
    intermediate = active;
1863
    while (intermediate->backing_hd) {
1864
        if (intermediate->backing_hd == bs) {
1865
            overlay = intermediate;
1866
            break;
1867
        }
1868
        intermediate = intermediate->backing_hd;
1869
    }
1870

    
1871
    return overlay;
1872
}
1873

    
1874
typedef struct BlkIntermediateStates {
1875
    BlockDriverState *bs;
1876
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
1877
} BlkIntermediateStates;
1878

    
1879

    
1880
/*
1881
 * Drops images above 'base' up to and including 'top', and sets the image
1882
 * above 'top' to have base as its backing file.
1883
 *
1884
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
1885
 * information in 'bs' can be properly updated.
1886
 *
1887
 * E.g., this will convert the following chain:
1888
 * bottom <- base <- intermediate <- top <- active
1889
 *
1890
 * to
1891
 *
1892
 * bottom <- base <- active
1893
 *
1894
 * It is allowed for bottom==base, in which case it converts:
1895
 *
1896
 * base <- intermediate <- top <- active
1897
 *
1898
 * to
1899
 *
1900
 * base <- active
1901
 *
1902
 * Error conditions:
1903
 *  if active == top, that is considered an error
1904
 *
1905
 */
1906
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
1907
                           BlockDriverState *base)
1908
{
1909
    BlockDriverState *intermediate;
1910
    BlockDriverState *base_bs = NULL;
1911
    BlockDriverState *new_top_bs = NULL;
1912
    BlkIntermediateStates *intermediate_state, *next;
1913
    int ret = -EIO;
1914

    
1915
    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
1916
    QSIMPLEQ_INIT(&states_to_delete);
1917

    
1918
    if (!top->drv || !base->drv) {
1919
        goto exit;
1920
    }
1921

    
1922
    new_top_bs = bdrv_find_overlay(active, top);
1923

    
1924
    if (new_top_bs == NULL) {
1925
        /* we could not find the image above 'top', this is an error */
1926
        goto exit;
1927
    }
1928

    
1929
    /* special case of new_top_bs->backing_hd already pointing to base - nothing
1930
     * to do, no intermediate images */
1931
    if (new_top_bs->backing_hd == base) {
1932
        ret = 0;
1933
        goto exit;
1934
    }
1935

    
1936
    intermediate = top;
1937

    
1938
    /* now we will go down through the list, and add each BDS we find
1939
     * into our deletion queue, until we hit the 'base'
1940
     */
1941
    while (intermediate) {
1942
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
1943
        intermediate_state->bs = intermediate;
1944
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
1945

    
1946
        if (intermediate->backing_hd == base) {
1947
            base_bs = intermediate->backing_hd;
1948
            break;
1949
        }
1950
        intermediate = intermediate->backing_hd;
1951
    }
1952
    if (base_bs == NULL) {
1953
        /* something went wrong, we did not end at the base. safely
1954
         * unravel everything, and exit with error */
1955
        goto exit;
1956
    }
1957

    
1958
    /* success - we can delete the intermediate states, and link top->base */
1959
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
1960
                                   base_bs->drv ? base_bs->drv->format_name : "");
1961
    if (ret) {
1962
        goto exit;
1963
    }
1964
    new_top_bs->backing_hd = base_bs;
1965

    
1966

    
1967
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
1968
        /* so that bdrv_close() does not recursively close the chain */
1969
        intermediate_state->bs->backing_hd = NULL;
1970
        bdrv_delete(intermediate_state->bs);
1971
    }
1972
    ret = 0;
1973

    
1974
exit:
1975
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
1976
        g_free(intermediate_state);
1977
    }
1978
    return ret;
1979
}
1980

    
1981

    
1982
/*
 * Validate a byte-granular request against the device.
 *
 * Returns -ENOMEDIUM when bdrv_is_inserted() reports no medium, 0 for
 * growable devices (no range check is done for them), -EIO when the
 * range [offset, offset + size) does not fit inside bdrv_getlength(),
 * and 0 otherwise.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t device_len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    device_len = bdrv_getlength(bs);

    /* Short-circuit order matters: offset must be known non-negative and
     * within the device before the subtraction is evaluated. */
    if (offset < 0 || offset > device_len || device_len - offset < size) {
        return -EIO;
    }

    return 0;
}
2003

    
2004
/*
 * Sector-granular front end for bdrv_check_byte_request(): converts the
 * sector position and count to bytes and forwards the check.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    int64_t byte_offset = sector_num * BDRV_SECTOR_SIZE;
    size_t byte_count = nb_sectors * BDRV_SECTOR_SIZE;

    return bdrv_check_byte_request(bs, byte_offset, byte_count);
}
2010

    
2011
typedef struct RwCo {
2012
    BlockDriverState *bs;
2013
    int64_t sector_num;
2014
    int nb_sectors;
2015
    QEMUIOVector *qiov;
2016
    bool is_write;
2017
    int ret;
2018
} RwCo;
2019

    
2020
/*
 * Coroutine entry point: dispatch the request described by the RwCo in
 * 'opaque' to the read or write path and store the result in ->ret.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *request = opaque;

    if (request->is_write) {
        request->ret = bdrv_co_do_writev(request->bs, request->sector_num,
                                         request->nb_sectors, request->qiov,
                                         0);
    } else {
        request->ret = bdrv_co_do_readv(request->bs, request->sector_num,
                                        request->nb_sectors, request->qiov,
                                        0);
    }
}
2032

    
2033
/*
2034
 * Process a synchronous request using coroutines
2035
 */
2036
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2037
                      int nb_sectors, bool is_write)
2038
{
2039
    QEMUIOVector qiov;
2040
    struct iovec iov = {
2041
        .iov_base = (void *)buf,
2042
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2043
    };
2044
    Coroutine *co;
2045
    RwCo rwco = {
2046
        .bs = bs,
2047
        .sector_num = sector_num,
2048
        .nb_sectors = nb_sectors,
2049
        .qiov = &qiov,
2050
        .is_write = is_write,
2051
        .ret = NOT_DONE,
2052
    };
2053

    
2054
    qemu_iovec_init_external(&qiov, &iov, 1);
2055

    
2056
    /**
2057
     * In sync call context, when the vcpu is blocked, this throttling timer
2058
     * will not fire; so the I/O throttling function has to be disabled here
2059
     * if it has been enabled.
2060
     */
2061
    if (bs->io_limits_enabled) {
2062
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
2063
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
2064
        bdrv_io_limits_disable(bs);
2065
    }
2066

    
2067
    if (qemu_in_coroutine()) {
2068
        /* Fast-path if already in coroutine context */
2069
        bdrv_rw_co_entry(&rwco);
2070
    } else {
2071
        co = qemu_coroutine_create(bdrv_rw_co_entry);
2072
        qemu_coroutine_enter(co, &rwco);
2073
        while (rwco.ret == NOT_DONE) {
2074
            qemu_aio_wait();
2075
        }
2076
    }
2077
    return rwco.ret;
2078
}
2079

    
2080
/* return < 0 if error. See bdrv_write() for the return codes */
2081
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2082
              uint8_t *buf, int nb_sectors)
2083
{
2084
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
2085
}
2086

    
2087
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2088
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2089
                          uint8_t *buf, int nb_sectors)
2090
{
2091
    bool enabled;
2092
    int ret;
2093

    
2094
    enabled = bs->io_limits_enabled;
2095
    bs->io_limits_enabled = false;
2096
    ret = bdrv_read(bs, 0, buf, 1);
2097
    bs->io_limits_enabled = enabled;
2098
    return ret;
2099
}
2100

    
2101
/* Return < 0 if error. Important errors are:
2102
  -EIO         generic I/O error (may happen for all errors)
2103
  -ENOMEDIUM   No media inserted.
2104
  -EINVAL      Invalid sector number or nb_sectors
2105
  -EACCES      Trying to write a read-only device
2106
*/
2107
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2108
               const uint8_t *buf, int nb_sectors)
2109
{
2110
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
2111
}
2112

    
2113
/*
 * Byte-granular synchronous read built on top of bdrv_read().
 *
 * The request is split into an unaligned head (read through a bounce
 * sector), a run of whole sectors read directly into the caller's buffer,
 * and an unaligned tail (again through the bounce sector).
 * Returns 'count1' on success or the negative bdrv_read() error.
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    uint8_t *dst = buf;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int chunk, whole_sectors;
    int ret;

    /* head: bytes needed to reach the next sector boundary */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), chunk);
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        dst += chunk;
    }

    /* middle: whole sectors go straight into the destination */
    whole_sectors = remaining >> BDRV_SECTOR_BITS;
    if (whole_sectors > 0) {
        ret = bdrv_read(bs, sector_num, dst, whole_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += whole_sectors;
        chunk = whole_sectors << BDRV_SECTOR_BITS;
        dst += chunk;
        remaining -= chunk;
    }

    /* tail: leading part of the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf, remaining);
    }
    return count1;
}
2157

    
2158
/*
 * Byte-granular synchronous write built on top of bdrv_read()/bdrv_write().
 *
 * Unaligned head and tail pieces are handled by read-modify-write of a
 * single bounce sector; whole sectors in between are written directly from
 * the caller's buffer.
 * Returns 'count1' on success or the negative error of the failing
 * bdrv_read()/bdrv_write() call.
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    const uint8_t *src = buf;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int chunk, whole_sectors;
    int ret;

    /* head: bytes needed to reach the next sector boundary */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), src, chunk);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        src += chunk;
    }

    /* middle: whole sectors are written straight from the source */
    whole_sectors = remaining >> BDRV_SECTOR_BITS;
    if (whole_sectors > 0) {
        ret = bdrv_write(bs, sector_num, src, whole_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += whole_sectors;
        chunk = whole_sectors << BDRV_SECTOR_BITS;
        src += chunk;
        remaining -= chunk;
    }

    /* tail: merge the remaining bytes into the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(sector_buf, src, remaining);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }
    return count1;
}
2206

    
2207
/*
2208
 * Writes to the file and ensures that no writes are reordered across this
2209
 * request (acts as a barrier)
2210
 *
2211
 * Returns 0 on success, -errno in error cases.
2212
 */
2213
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2214
    const void *buf, int count)
2215
{
2216
    int ret;
2217

    
2218
    ret = bdrv_pwrite(bs, offset, buf, count);
2219
    if (ret < 0) {
2220
        return ret;
2221
    }
2222

    
2223
    /* No flush needed for cache modes that already do it */
2224
    if (bs->enable_write_cache) {
2225
        bdrv_flush(bs);
2226
    }
2227

    
2228
    return 0;
2229
}
2230

    
2231
/*
 * Copy-on-read helper: read the cluster-aligned range covering the request
 * into a bounce buffer, write that range back through the driver, then copy
 * the requested part into the caller's 'qiov'.
 *
 * Returns the driver's negative error if the read or the write-back fails,
 * otherwise the (non-negative) result of the write-back.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector cluster_qiov;
    struct iovec cluster_iov;
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce;
    int64_t first_sector;
    int sector_count;
    size_t lead_in;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &first_sector, &sector_count);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   first_sector, sector_count);

    cluster_iov.iov_len = sector_count * BDRV_SECTOR_SIZE;
    bounce = qemu_blockalign(bs, cluster_iov.iov_len);
    cluster_iov.iov_base = bounce;
    qemu_iovec_init_external(&cluster_qiov, &cluster_iov, 1);

    ret = drv->bdrv_co_readv(bs, first_sector, sector_count, &cluster_qiov);

    if (ret >= 0) {
        if (drv->bdrv_co_write_zeroes &&
            buffer_is_zero(bounce, cluster_iov.iov_len)) {
            ret = bdrv_co_do_write_zeroes(bs, first_sector, sector_count);
        } else {
            /* This does not change the data on the disk, it is not necessary
             * to flush even in cache=writethrough mode.
             */
            ret = drv->bdrv_co_writev(bs, first_sector, sector_count,
                                      &cluster_qiov);
        }
    }

    /* It might be okay to ignore write errors for guest requests.  If this
     * is a deliberate copy-on-read then we don't want to ignore the error.
     * Simply report it in all cases: data is handed to the caller only
     * when both steps succeeded.
     */
    if (ret >= 0) {
        lead_in = (sector_num - first_sector) * BDRV_SECTOR_SIZE;
        qemu_iovec_from_buf(qiov, 0, bounce + lead_in,
                            nb_sectors * BDRV_SECTOR_SIZE);
    }

    qemu_vfree(bounce);
    return ret;
}
2296

    
2297
/*
2298
 * Handle a read request in coroutine context
2299
 */
2300
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2301
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2302
    BdrvRequestFlags flags)
2303
{
2304
    BlockDriver *drv = bs->drv;
2305
    BdrvTrackedRequest req;
2306
    int ret;
2307

    
2308
    if (!drv) {
2309
        return -ENOMEDIUM;
2310
    }
2311
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2312
        return -EIO;
2313
    }
2314

    
2315
    /* throttling disk read I/O */
2316
    if (bs->io_limits_enabled) {
2317
        bdrv_io_limits_intercept(bs, false, nb_sectors);
2318
    }
2319

    
2320
    if (bs->copy_on_read) {
2321
        flags |= BDRV_REQ_COPY_ON_READ;
2322
    }
2323
    if (flags & BDRV_REQ_COPY_ON_READ) {
2324
        bs->copy_on_read_in_flight++;
2325
    }
2326

    
2327
    if (bs->copy_on_read_in_flight) {
2328
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2329
    }
2330

    
2331
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2332

    
2333
    if (flags & BDRV_REQ_COPY_ON_READ) {
2334
        int pnum;
2335

    
2336
        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
2337
        if (ret < 0) {
2338
            goto out;
2339
        }
2340

    
2341
        if (!ret || pnum != nb_sectors) {
2342
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2343
            goto out;
2344
        }
2345
    }
2346

    
2347
    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2348

    
2349
out:
2350
    tracked_request_end(&req);
2351

    
2352
    if (flags & BDRV_REQ_COPY_ON_READ) {
2353
        bs->copy_on_read_in_flight--;
2354
    }
2355

    
2356
    return ret;
2357
}
2358

    
2359
/* Traced coroutine read with no request flags set. */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    const BdrvRequestFlags no_flags = 0;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, no_flags);
}
2366

    
2367
/* Traced coroutine read that always requests copy-on-read handling. */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
2375

    
2376
/*
 * Zero out 'nb_sectors' sectors starting at 'sector_num'.
 *
 * Uses the driver's bdrv_co_write_zeroes() when it exists and does not
 * answer -ENOTSUP; otherwise writes a zero-filled bounce buffer through
 * bdrv_co_writev().  Returns the result of whichever call was used.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector zero_qiov;
    struct iovec zero_iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    zero_iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    zero_iov.iov_base = qemu_blockalign(bs, zero_iov.iov_len);
    memset(zero_iov.iov_base, 0, zero_iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &zero_iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);

    qemu_vfree(zero_iov.iov_base);
    return ret;
}
2406

    
2407
/*
2408
 * Handle a write request in coroutine context
2409
 */
2410
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2411
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2412
    BdrvRequestFlags flags)
2413
{
2414
    BlockDriver *drv = bs->drv;
2415
    BdrvTrackedRequest req;
2416
    int ret;
2417

    
2418
    if (!bs->drv) {
2419
        return -ENOMEDIUM;
2420
    }
2421
    if (bs->read_only) {
2422
        return -EACCES;
2423
    }
2424
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2425
        return -EIO;
2426
    }
2427

    
2428
    /* throttling disk write I/O */
2429
    if (bs->io_limits_enabled) {
2430
        bdrv_io_limits_intercept(bs, true, nb_sectors);
2431
    }
2432

    
2433
    if (bs->copy_on_read_in_flight) {
2434
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2435
    }
2436

    
2437
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2438

    
2439
    if (flags & BDRV_REQ_ZERO_WRITE) {
2440
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2441
    } else {
2442
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2443
    }
2444

    
2445
    if (ret == 0 && !bs->enable_write_cache) {
2446
        ret = bdrv_co_flush(bs);
2447
    }
2448

    
2449
    if (bs->dirty_bitmap) {
2450
        bdrv_set_dirty(bs, sector_num, nb_sectors);
2451
    }
2452

    
2453
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2454
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
2455
    }
2456

    
2457
    tracked_request_end(&req);
2458

    
2459
    return ret;
2460
}
2461

    
2462
/* Traced coroutine write with no request flags set. */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    const BdrvRequestFlags no_flags = 0;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, no_flags);
}
2469

    
2470
/*
 * Traced coroutine zero-write: goes through the regular write path with
 * BDRV_REQ_ZERO_WRITE set and no data vector.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    int ret;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                            BDRV_REQ_ZERO_WRITE);
    return ret;
}
2478

    
2479
/**
2480
 * Truncate file to 'offset' bytes (needed only for file protocols)
2481
 */
2482
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2483
{
2484
    BlockDriver *drv = bs->drv;
2485
    int ret;
2486
    if (!drv)
2487
        return -ENOMEDIUM;
2488
    if (!drv->bdrv_truncate)
2489
        return -ENOTSUP;
2490
    if (bs->read_only)
2491
        return -EACCES;
2492
    if (bdrv_in_use(bs))
2493
        return -EBUSY;
2494

    
2495
    /* There better not be any in-flight IOs when we truncate the device. */
2496
    bdrv_drain_all();
2497

    
2498
    ret = drv->bdrv_truncate(bs, offset);
2499
    if (ret == 0) {
2500
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2501
        bdrv_dev_resize_cb(bs);
2502
    }
2503
    return ret;
2504
}
2505

    
2506
/**
2507
 * Length of a allocated file in bytes. Sparse files are counted by actual
2508
 * allocated space. Return < 0 if error or unknown.
2509
 */
2510
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2511
{
2512
    BlockDriver *drv = bs->drv;
2513
    if (!drv) {
2514
        return -ENOMEDIUM;
2515
    }
2516
    if (drv->bdrv_get_allocated_file_size) {
2517
        return drv->bdrv_get_allocated_file_size(bs);
2518
    }
2519
    if (bs->file) {
2520
        return bdrv_get_allocated_file_size(bs->file);
2521
    }
2522
    return -ENOTSUP;
2523
}
2524

    
2525
/**
2526
 * Length of a file in bytes. Return < 0 if error or unknown.
2527
 */
2528
int64_t bdrv_getlength(BlockDriverState *bs)
2529
{
2530
    BlockDriver *drv = bs->drv;
2531
    if (!drv)
2532
        return -ENOMEDIUM;
2533

    
2534
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2535
        if (drv->bdrv_getlength) {
2536
            return drv->bdrv_getlength(bs);
2537
        }
2538
    }
2539
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2540
}
2541

    
2542
/* return 0 as number of sectors if no device present or error */
2543
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2544
{
2545
    int64_t length;
2546
    length = bdrv_getlength(bs);
2547
    if (length < 0)
2548
        length = 0;
2549
    else
2550
        length = length >> BDRV_SECTOR_BITS;
2551
    *nb_sectors_ptr = length;
2552
}
2553

    
2554
/* throttling disk io limits */
2555
void bdrv_set_io_limits(BlockDriverState *bs,
2556
                        BlockIOLimit *io_limits)
2557
{
2558
    bs->io_limits = *io_limits;
2559
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2560
}
2561

    
2562
/* Record the error policies to apply to failed reads and failed writes. */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
2568

    
2569
/* Return the configured error policy for reads or for writes. */
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
2573

    
2574
/*
 * Map the device's error policy for the given direction, together with
 * the (positive) errno value 'error', to the action to take.
 * Aborts on an unknown policy value.
 */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError policy;

    if (is_read) {
        policy = bs->on_read_error;
    } else {
        policy = bs->on_write_error;
    }

    switch (policy) {
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_ENOSPC:
        /* stop only when the device ran out of space */
        if (error == ENOSPC) {
            return BDRV_ACTION_STOP;
        }
        return BDRV_ACTION_REPORT;
    default:
        abort();
    }
}
2591

    
2592
/* This is done by device models because, while the block layer knows
2593
 * about the error, it does not know whether an operation comes from
2594
 * the device or the block layer (from a job, for example).
2595
 */
2596
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2597
                       bool is_read, int error)
2598
{
2599
    assert(error >= 0);
2600
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
2601
    if (action == BDRV_ACTION_STOP) {
2602
        vm_stop(RUN_STATE_IO_ERROR);
2603
        bdrv_iostatus_set_err(bs, error);
2604
    }
2605
}
2606

    
2607
/* Return the device's read_only flag. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
2611

    
2612
/* Return the device's sg flag. */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
2616

    
2617
/* Return the device's enable_write_cache setting. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
2621

    
2622
/*
 * Set the device's write-cache setting and mirror it into
 * BDRV_O_CACHE_WB of the open flags.
 */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (!wce) {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    } else {
        bs->open_flags |= BDRV_O_CACHE_WB;
    }
}
2633

    
2634
/*
 * Return 1 when the backing image is encrypted, otherwise the device's
 * own 'encrypted' flag.
 */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing != NULL && backing->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
2640

    
2641
/*
 * Return non-zero when the device or its backing image is encrypted and
 * does not yet have a valid key.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing != NULL && backing->encrypted && !backing->valid_key) {
        return 1;
    }
    if (!bs->encrypted) {
        return 0;
    }
    return !bs->valid_key;
}
2649

    
2650
/*
 * Set the encryption key for the device, first applying it to an
 * encrypted backing image.
 *
 * Returns the backing image's error if setting its key fails, 0 when only
 * the backing image is encrypted, -EINVAL when the device is not
 * encrypted, -ENOMEDIUM when there is no driver or no set-key callback,
 * otherwise the driver's result.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    BlockDriverState *backing = bs->backing_hd;
    int ret;

    if (backing && backing->encrypted) {
        ret = bdrv_set_key(backing, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2675

    
2676
/* Return the driver's format name, or NULL when no driver is attached. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }
    return bs->drv->format_name;
}
2680

    
2681
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2682
                         void *opaque)
2683
{
2684
    BlockDriver *drv;
2685

    
2686
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
2687
        it(opaque, drv->format_name);
2688
    }
2689
}
2690

    
2691
BlockDriverState *bdrv_find(const char *name)
2692
{
2693
    BlockDriverState *bs;
2694

    
2695
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2696
        if (!strcmp(name, bs->device_name)) {
2697
            return bs;
2698
        }
2699
    }
2700
    return NULL;
2701
}
2702

    
2703
/*
 * Iterate over bdrv_states: NULL yields the first entry, any other value
 * yields the entry following it.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
2710

    
2711
/* Call 'it' with 'opaque' for every entry of the bdrv_states list. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
2719

    
2720
/* Return the device's name string (storage owned by 'bs'). */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
2724

    
2725
/* Return the device's current open flags. */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
2729

    
2730
void bdrv_flush_all(void)
2731
{
2732
    BlockDriverState *bs;
2733

    
2734
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2735
        bdrv_flush(bs);
2736
    }
2737
}
2738

    
2739
/*
 * Ask the driver's bdrv_has_zero_init callback; when the driver has no
 * such callback the answer defaults to 1.  A driver must be attached.
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (!bs->drv->bdrv_has_zero_init) {
        return 1;
    }

    return bs->drv->bdrv_has_zero_init(bs);
}
2749

    
2750
typedef struct BdrvCoIsAllocatedData {
2751
    BlockDriverState *bs;
2752
    BlockDriverState *base;
2753
    int64_t sector_num;
2754
    int nb_sectors;
2755
    int *pnum;
2756
    int ret;
2757
    bool done;
2758
} BdrvCoIsAllocatedData;
2759

    
2760
/*
2761
 * Returns true iff the specified sector is present in the disk image. Drivers
2762
 * not implementing the functionality are assumed to not support backing files,
2763
 * hence all their sectors are reported as allocated.
2764
 *
2765
 * If 'sector_num' is beyond the end of the disk image the return value is 0
2766
 * and 'pnum' is set to 0.
2767
 *
2768
 * 'pnum' is set to the number of sectors (including and immediately following
2769
 * the specified sector) that are known to be in the same
2770
 * allocated/unallocated state.
2771
 *
2772
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2773
 * beyond the end of the disk image it will be clamped.
2774
 */
2775
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2776
                                      int nb_sectors, int *pnum)
2777
{
2778
    int64_t n;
2779

    
2780
    if (sector_num >= bs->total_sectors) {
2781
        *pnum = 0;
2782
        return 0;
2783
    }
2784

    
2785
    n = bs->total_sectors - sector_num;
2786
    if (n < nb_sectors) {
2787
        nb_sectors = n;
2788
    }
2789

    
2790
    if (!bs->drv->bdrv_co_is_allocated) {
2791
        *pnum = nb_sectors;
2792
        return 1;
2793
    }
2794

    
2795
    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2796
}
2797

    
2798
/* Coroutine wrapper for bdrv_is_allocated() */
2799
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2800
{
2801
    BdrvCoIsAllocatedData *data = opaque;
2802
    BlockDriverState *bs = data->bs;
2803

    
2804
    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2805
                                     data->pnum);
2806
    data->done = true;
2807
}
2808

    
2809
/*
2810
 * Synchronous wrapper around bdrv_co_is_allocated().
2811
 *
2812
 * See bdrv_co_is_allocated() for details.
2813
 */
2814
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2815
                      int *pnum)
2816
{
2817
    Coroutine *co;
2818
    BdrvCoIsAllocatedData data = {
2819
        .bs = bs,
2820
        .sector_num = sector_num,
2821
        .nb_sectors = nb_sectors,
2822
        .pnum = pnum,
2823
        .done = false,
2824
    };
2825

    
2826
    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2827
    qemu_coroutine_enter(co, &data);
2828
    while (!data.done) {
2829
        qemu_aio_wait();
2830
    }
2831
    return data.ret;
2832
}
2833

    
2834
/*
2835
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2836
 *
2837
 * Return true if the given sector is allocated in any image between
2838
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
2839
 * sector is allocated in any image of the chain.  Return false otherwise.
2840
 *
2841
 * 'pnum' is set to the number of sectors (including and immediately following
2842
 *  the specified sector) that are known to be in the same
2843
 *  allocated/unallocated state.
2844
 *
2845
 */
2846
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2847
                                            BlockDriverState *base,
2848
                                            int64_t sector_num,
2849
                                            int nb_sectors, int *pnum)
2850
{
2851
    BlockDriverState *intermediate;
2852
    int ret, n = nb_sectors;
2853

    
2854
    intermediate = top;
2855
    while (intermediate && intermediate != base) {
2856
        int pnum_inter;
2857
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2858
                                   &pnum_inter);
2859
        if (ret < 0) {
2860
            return ret;
2861
        } else if (ret) {
2862
            *pnum = pnum_inter;
2863
            return 1;
2864
        }
2865

    
2866
        /*
2867
         * [sector_num, nb_sectors] is unallocated on top but intermediate
2868
         * might have
2869
         *
2870
         * [sector_num+x, nr_sectors] allocated.
2871
         */
2872
        if (n > pnum_inter &&
2873
            (intermediate == top ||
2874
             sector_num + pnum_inter < intermediate->total_sectors)) {
2875
            n = pnum_inter;
2876
        }
2877

    
2878
        intermediate = intermediate->backing_hd;
2879
    }
2880

    
2881
    *pnum = n;
2882
    return 0;
2883
}
2884

    
2885
/* Coroutine wrapper for bdrv_is_allocated_above() */
2886
static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
2887
{
2888
    BdrvCoIsAllocatedData *data = opaque;
2889
    BlockDriverState *top = data->bs;
2890
    BlockDriverState *base = data->base;
2891

    
2892
    data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
2893
                                           data->nb_sectors, data->pnum);
2894
    data->done = true;
2895
}
2896

    
2897
/*
2898
 * Synchronous wrapper around bdrv_co_is_allocated_above().
2899
 *
2900
 * See bdrv_co_is_allocated_above() for details.
2901
 */
2902
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
2903
                            int64_t sector_num, int nb_sectors, int *pnum)
2904
{
2905
    Coroutine *co;
2906
    BdrvCoIsAllocatedData data = {
2907
        .bs = top,
2908
        .base = base,
2909
        .sector_num = sector_num,
2910
        .nb_sectors = nb_sectors,
2911
        .pnum = pnum,
2912
        .done = false,
2913
    };
2914

    
2915
    co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
2916
    qemu_coroutine_enter(co, &data);
2917
    while (!data.done) {
2918
        qemu_aio_wait();
2919
    }
2920
    return data.ret;
2921
}
2922

    
2923
/*
 * Build a newly allocated BlockInfo describing @bs.
 *
 * Optional members (tray state, I/O status, dirty bitmap statistics and the
 * description of the inserted medium) are only filled in when the matching
 * condition on @bs holds; their has_* flag is set at the same time.
 */
BlockInfo *bdrv_query_info(BlockDriverState *bs)
{
    BlockInfo *info = g_malloc0(sizeof(*info));

    info->device = g_strdup(bs->device_name);
    info->type = g_strdup("unknown");
    info->locked = bdrv_dev_is_medium_locked(bs);
    info->removable = bdrv_dev_has_removable_media(bs);

    if (bdrv_dev_has_removable_media(bs)) {
        info->has_tray_open = true;
        info->tray_open = bdrv_dev_is_tray_open(bs);
    }

    if (bdrv_iostatus_is_enabled(bs)) {
        info->has_io_status = true;
        info->io_status = bs->iostatus;
    }

    if (bs->dirty_bitmap) {
        info->has_dirty = true;
        info->dirty = g_malloc0(sizeof(*info->dirty));
        /* The dirty count is scaled by the sector size. */
        info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
        info->dirty->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
    }

    /* Without a driver there is nothing to report under "inserted". */
    if (!bs->drv) {
        return info;
    }

    info->has_inserted = true;
    info->inserted = g_malloc0(sizeof(*info->inserted));
    info->inserted->file = g_strdup(bs->filename);
    info->inserted->ro = bs->read_only;
    info->inserted->drv = g_strdup(bs->drv->format_name);
    info->inserted->encrypted = bs->encrypted;
    info->inserted->encryption_key_missing = bdrv_key_required(bs);

    if (bs->backing_file[0]) {
        info->inserted->has_backing_file = true;
        info->inserted->backing_file = g_strdup(bs->backing_file);
    }

    info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);

    /* Throttling values are copied only while I/O limits are enabled;
     * otherwise they keep the zero value from g_malloc0(). */
    if (bs->io_limits_enabled) {
        info->inserted->bps    = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
        info->inserted->bps_rd = bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
        info->inserted->bps_wr = bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
        info->inserted->iops    = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
        info->inserted->iops_rd = bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
        info->inserted->iops_wr = bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
    }

    return info;
}
2982

    
2983
/*
 * Return a newly allocated list with one BlockInfo per entry of
 * bdrv_states, in iteration order.  @errp is not touched here.
 */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *first = NULL;
    BlockInfoList **tail = &first;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *entry = g_malloc0(sizeof(*entry));

        entry->value = bdrv_query_info(bs);

        /* Append at the tail so the list keeps the iteration order. */
        *tail = entry;
        tail = &entry->next;
    }

    return first;
}
2998

    
2999
BlockStats *bdrv_query_stats(const BlockDriverState *bs)
3000
{
3001
    BlockStats *s;
3002

    
3003
    s = g_malloc0(sizeof(*s));
3004

    
3005
    if (bs->device_name[0]) {
3006
        s->has_device = true;
3007
        s->device = g_strdup(bs->device_name);
3008
    }
3009

    
3010
    s->stats = g_malloc0(sizeof(*s->stats));
3011
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
3012
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
3013
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
3014
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
3015
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
3016
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
3017
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
3018
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
3019
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
3020

    
3021
    if (bs->file) {
3022
        s->has_parent = true;
3023
        s->parent = bdrv_query_stats(bs->file);
3024
    }
3025

    
3026
    return s;
3027
}
3028

    
3029
/*
 * Return a newly allocated list with one BlockStats per entry of
 * bdrv_states, in iteration order.  @errp is not touched here.
 */
BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *first = NULL;
    BlockStatsList **tail = &first;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *entry = g_malloc0(sizeof(*entry));

        entry->value = bdrv_query_stats(bs);

        /* Append at the tail so the list keeps the iteration order. */
        *tail = entry;
        tail = &entry->next;
    }

    return first;
}
3044

    
3045
/*
 * Return the name of the encrypted image associated with @bs: the backing
 * file name if the backing image is encrypted, otherwise @bs's own file
 * name if @bs is encrypted, otherwise NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }
    if (bs->encrypted) {
        return bs->filename;
    }
    return NULL;
}
3054

    
3055
/*
 * Copy the backing file name recorded in @bs into @filename, which has
 * room for @filename_size bytes (the copy is done with pstrcpy()).
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing_name = bs->backing_file;

    pstrcpy(filename, filename_size, backing_name);
}
3060

    
3061
/*
 * Write @nb_sectors sectors from @buf at @sector_num through the driver's
 * bdrv_write_compressed callback.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP when the driver has no
 * such callback, -EIO when bdrv_check_request() rejects the request, and
 * the callback's result otherwise.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* This path must not be used while a dirty bitmap is attached. */
    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
3076

    
3077
/*
 * Fill @bdi through the driver's bdrv_get_info callback.
 *
 * @bdi is zeroed before the callback runs.  Returns -ENOMEDIUM without a
 * driver and -ENOTSUP when the driver has no such callback; in both cases
 * @bdi is left untouched.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
3087

    
3088
/*
 * Save @size bytes of VM state from @buf at position @pos.
 *
 * Uses the driver's bdrv_save_vmstate callback when present; otherwise the
 * request is forwarded to bs->file if there is one.  Returns -ENOMEDIUM
 * without a driver and -ENOTSUP when nothing can handle the request.
 */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    }
    if (bs->file) {
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    }
    return -ENOTSUP;
}
3100

    
3101
/*
 * Load @size bytes of VM state into @buf from position @pos.
 *
 * Uses the driver's bdrv_load_vmstate callback when present; otherwise the
 * request is forwarded to bs->file if there is one.  Returns -ENOMEDIUM
 * without a driver and -ENOTSUP when nothing can handle the request.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_load_vmstate) {
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    }
    if (bs->file) {
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    }
    return -ENOTSUP;
}
3113

    
3114
/*
 * Pass @event to the driver's bdrv_debug_event callback.  Does nothing
 * when @bs has no driver or the driver has no such callback.
 */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_debug_event) {
        drv->bdrv_debug_event(bs, event);
    }
}
3124

    
3125
/*
 * Call bdrv_debug_breakpoint on the first node, starting at @bs and
 * following bs->file links, whose driver provides that callback.
 *
 * Returns the callback's result, or -ENOTSUP when the walk ends (NULL node
 * or node without driver) before such a driver is found.
 */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_breakpoint) {
            return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
        }
    }

    return -ENOTSUP;
}
3138

    
3139
/*
 * Call bdrv_debug_resume on the first node, starting at @bs and following
 * bs->file links, whose driver provides that callback.
 *
 * Returns the callback's result, or -ENOTSUP when the walk ends (NULL node
 * or node without driver) before such a driver is found.
 */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_resume) {
            return bs->drv->bdrv_debug_resume(bs, tag);
        }
    }

    return -ENOTSUP;
}
3151

    
3152
/*
 * Call bdrv_debug_is_suspended on the first node, starting at @bs and
 * following bs->file links, whose driver provides that callback.
 *
 * Returns the callback's result, or false when the walk ends (NULL node or
 * node without driver) before such a driver is found.
 */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_is_suspended) {
            return bs->drv->bdrv_debug_is_suspended(bs, tag);
        }
    }

    return false;
}
3164

    
3165
/**************************************************************/
3166
/* handling of snapshots */
3167

    
3168
/*
 * Return 1 if snapshots can be created on @bs, 0 otherwise.
 *
 * @bs must have a driver, be inserted and not be read-only.  If its driver
 * has no bdrv_snapshot_create callback, the answer is taken from bs->file
 * (when present).
 */
int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (drv->bdrv_snapshot_create) {
        return 1;
    }

    return bs->file != NULL ? bdrv_can_snapshot(bs->file) : 0;
}
3184

    
3185
/* Return 1 if BDRV_O_SNAPSHOT is set in @bs's open flags, 0 otherwise. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
3189

    
3190
BlockDriverState *bdrv_snapshots(void)
3191
{
3192
    BlockDriverState *bs;
3193

    
3194
    if (bs_snapshots) {
3195
        return bs_snapshots;
3196
    }
3197

    
3198
    bs = NULL;
3199
    while ((bs = bdrv_next(bs))) {
3200
        if (bdrv_can_snapshot(bs)) {
3201
            bs_snapshots = bs;
3202
            return bs;
3203
        }
3204
    }
3205
    return NULL;
3206
}
3207

    
3208
int bdrv_snapshot_create(BlockDriverState *bs,
3209
                         QEMUSnapshotInfo *sn_info)
3210
{
3211
    BlockDriver *drv = bs->drv;
3212
    if (!drv)
3213
        return -ENOMEDIUM;
3214
    if (drv->bdrv_snapshot_create)
3215
        return drv->bdrv_snapshot_create(bs, sn_info);
3216
    if (bs->file)
3217
        return bdrv_snapshot_create(bs->file, sn_info);
3218
    return -ENOTSUP;
3219
}
3220

    
3221
/*
 * Switch @bs to the snapshot identified by @snapshot_id.
 *
 * When the driver implements bdrv_snapshot_goto, that callback does the
 * work.  Otherwise, if @bs has a bs->file, the driver is closed, the
 * snapshot is applied to bs->file, and the driver is opened again.  If
 * reopening fails, bs->file is deleted, bs->drv is cleared and the open
 * error is returned instead of the snapshot result.
 *
 * Returns -ENOMEDIUM without a driver and -ENOTSUP when neither path is
 * available.
 */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int goto_ret, reopen_ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_snapshot_goto) {
        return drv->bdrv_snapshot_goto(bs, snapshot_id);
    }
    if (!bs->file) {
        return -ENOTSUP;
    }

    drv->bdrv_close(bs);
    goto_ret = bdrv_snapshot_goto(bs->file, snapshot_id);
    reopen_ret = drv->bdrv_open(bs, NULL, bs->open_flags);
    if (reopen_ret < 0) {
        bdrv_delete(bs->file);
        bs->drv = NULL;
        return reopen_ret;
    }

    return goto_ret;
}
3246

    
3247
/*
 * Delete the snapshot identified by @snapshot_id.
 *
 * Uses the driver's bdrv_snapshot_delete callback when present; otherwise
 * the request is forwarded to bs->file if there is one.  Returns
 * -ENOMEDIUM without a driver and -ENOTSUP when nothing can handle it.
 */
int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_snapshot_delete) {
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    }
    if (bs->file) {
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    }
    return -ENOTSUP;
}
3258

    
3259
int bdrv_snapshot_list(BlockDriverState *bs,
3260
                       QEMUSnapshotInfo **psn_info)
3261
{
3262
    BlockDriver *drv = bs->drv;
3263
    if (!drv)
3264
        return -ENOMEDIUM;
3265
    if (drv->bdrv_snapshot_list)
3266
        return drv->bdrv_snapshot_list(bs, psn_info);
3267
    if (bs->file)
3268
        return bdrv_snapshot_list(bs->file, psn_info);
3269
    return -ENOTSUP;
3270
}
3271

    
3272
/*
 * Invoke the driver's bdrv_snapshot_load_tmp callback for @snapshot_name.
 *
 * Only permitted on read-only devices: returns -EINVAL when @bs is not
 * read-only, -ENOMEDIUM without a driver, and -ENOTSUP when the driver has
 * no such callback.  Unlike the other snapshot helpers, this one does not
 * fall back to bs->file.
 */
int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (!drv->bdrv_snapshot_load_tmp) {
        return -ENOTSUP;
    }

    return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
}
3287

    
3288
/* backing_file can either be relative, or absolute, or a protocol.  If it is
3289
 * relative, it must be relative to the chain.  So, passing in bs->filename
3290
 * from a BDS as backing_file should not be done, as that may be relative to
3291
 * the CWD rather than the chain. */
3292
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3293
        const char *backing_file)
3294
{
3295
    char *filename_full = NULL;
3296
    char *backing_file_full = NULL;
3297
    char *filename_tmp = NULL;
3298
    int is_protocol = 0;
3299
    BlockDriverState *curr_bs = NULL;
3300
    BlockDriverState *retval = NULL;
3301

    
3302
    if (!bs || !bs->drv || !backing_file) {
3303
        return NULL;
3304
    }
3305

    
3306
    filename_full     = g_malloc(PATH_MAX);
3307
    backing_file_full = g_malloc(PATH_MAX);
3308
    filename_tmp      = g_malloc(PATH_MAX);
3309

    
3310
    is_protocol = path_has_protocol(backing_file);
3311

    
3312
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3313

    
3314
        /* If either of the filename paths is actually a protocol, then
3315
         * compare unmodified paths; otherwise make paths relative */
3316
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3317
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3318
                retval = curr_bs->backing_hd;
3319
                break;
3320
            }
3321
        } else {
3322
            /* If not an absolute filename path, make it relative to the current
3323
             * image's filename path */
3324
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3325
                         backing_file);
3326

    
3327
            /* We are going to compare absolute pathnames */
3328
            if (!realpath(filename_tmp, filename_full)) {
3329
                continue;
3330
            }
3331

    
3332
            /* We need to make sure the backing filename we are comparing against
3333
             * is relative to the current image filename (or absolute) */
3334
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3335
                         curr_bs->backing_file);
3336

    
3337
            if (!realpath(filename_tmp, backing_file_full)) {
3338
                continue;
3339
            }
3340

    
3341
            if (strcmp(backing_file_full, filename_full) == 0) {
3342
                retval = curr_bs->backing_hd;
3343
                break;
3344
            }
3345
        }
3346
    }
3347

    
3348
    g_free(filename_full);
3349
    g_free(backing_file_full);
3350
    g_free(filename_tmp);
3351
    return retval;
3352
}
3353

    
3354
/*
 * Count the backing images below @bs.  The walk stops at the first image
 * that has no driver or no backing image; that image itself is not counted.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
3366

    
3367
/*
 * Return the last image of @bs's backing chain, i.e. the first one without
 * a backing_hd (which is @bs itself when it has none).  NULL in, NULL out.
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *node = bs;

    if (!node) {
        return NULL;
    }

    while (node->backing_hd) {
        node = node->backing_hd;
    }

    return node;
}
3382

    
3383
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (at most @buf_size bytes, via snprintf) in a short
 * human-readable form and return @buf.
 *
 * - Values up to 999 (including negative ones) are printed as plain
 *   integers.
 * - Larger values are scaled by powers of 1024 and tagged with K, M, G or
 *   T.  Below ten units one decimal is printed ("1.5K"); from ten units up
 *   the value is rounded to the nearest integer ("100G").
 * - Values too large for the other suffixes are always expressed in T.
 *
 * The rounded integer is computed from quotient and remainder instead of
 * "(size + base / 2) / base", because that addition overflows int64_t
 * (undefined behavior) for sizes close to INT64_MAX.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    /* Exactly NB_SUFFIXES characters; deliberately not NUL-terminated. */
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    base = 1024;
    for (i = 0; i < NB_SUFFIXES; i++) {
        if (size < (10 * base)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / base,
                     suffixes[i]);
            break;
        } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
            /* size > 999 here, so '/' and '%' operate on positive values
             * and this equals round-half-up of size / base. */
            int64_t rounded = size / base;

            if (size % base >= (base >> 1)) {
                rounded++;
            }
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     rounded,
                     suffixes[i]);
            break;
        }
        base = base * 1024;
    }

    return buf;
}
3412

    
3413
/*
 * Format one line of the snapshot table into @buf (at most @buf_size
 * bytes) and return @buf.
 *
 * With @sn == NULL the column header line is produced.  Otherwise the line
 * holds the snapshot's id, name, VM state size (human-readable), creation
 * date in local time, and VM clock as hours:minutes:seconds.milliseconds.
 */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char size_str[128], date_str[128], clock_str[128];
    struct tm tm;
    time_t created;
    int64_t clock_secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
        return buf;
    }

    created = sn->date_sec;
    localtime_r(&created, &tm);
    strftime(date_str, sizeof(date_str),
             "%Y-%m-%d %H:%M:%S", &tm);

    /* vm_clock_nsec is in nanoseconds. */
    clock_secs = sn->vm_clock_nsec / 1000000000;
    snprintf(clock_str, sizeof(clock_str),
             "%02d:%02d:%02d.%03d",
             (int)(clock_secs / 3600),
             (int)((clock_secs / 60) % 60),
             (int)(clock_secs % 60),
             (int)((sn->vm_clock_nsec / 1000000) % 1000));

    get_human_readable_size(size_str, sizeof(size_str), sn->vm_state_size);

    snprintf(buf, buf_size,
             "%-10s%-20s%7s%20s%15s",
             sn->id_str, sn->name,
             size_str,
             date_str,
             clock_str);
    return buf;
}
3445

    
3446
/**************************************************************/
3447
/* async I/Os */
3448

    
3449
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3450
                                 QEMUIOVector *qiov, int nb_sectors,
3451
                                 BlockDriverCompletionFunc *cb, void *opaque)
3452
{
3453
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3454

    
3455
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3456
                                 cb, opaque, false);
3457
}
3458

    
3459
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3460
                                  QEMUIOVector *qiov, int nb_sectors,
3461
                                  BlockDriverCompletionFunc *cb, void *opaque)
3462
{
3463
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3464

    
3465
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3466
                                 cb, opaque, true);
3467
}
3468

    
3469

    
3470
typedef struct MultiwriteCB {
3471
    int error;
3472
    int num_requests;
3473
    int num_callbacks;
3474
    struct {
3475
        BlockDriverCompletionFunc *cb;
3476
        void *opaque;
3477
        QEMUIOVector *free_qiov;
3478
    } callbacks[];
3479
} MultiwriteCB;
3480

    
3481
/*
 * Invoke every recorded user callback with the overall result in
 * mcb->error, then destroy and free the merged I/O vector attached to that
 * callback slot, if any.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx;

    for (idx = 0; idx < mcb->num_callbacks; idx++) {
        QEMUIOVector *merged;

        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        merged = mcb->callbacks[idx].free_qiov;
        if (merged) {
            qemu_iovec_destroy(merged);
            g_free(merged);
        }
    }
}
3493

    
3494
static void multiwrite_cb(void *opaque, int ret)
3495
{
3496
    MultiwriteCB *mcb = opaque;
3497

    
3498
    trace_multiwrite_cb(mcb, ret);
3499

    
3500
    if (ret < 0 && !mcb->error) {
3501
        mcb->error = ret;
3502
    }
3503

    
3504
    mcb->num_requests--;
3505
    if (mcb->num_requests == 0) {
3506
        multiwrite_user_cb(mcb);
3507
        g_free(mcb);
3508
    }
3509
}
3510

    
3511
static int multiwrite_req_compare(const void *a, const void *b)
3512
{
3513
    const BlockRequest *req1 = a, *req2 = b;
3514

    
3515
    /*
3516
     * Note that we can't simply subtract req2->sector from req1->sector
3517
     * here as that could overflow the return value.
3518
     */
3519
    if (req1->sector > req2->sector) {
3520
        return 1;
3521
    } else if (req1->sector < req2->sector) {
3522
        return -1;
3523
    } else {
3524
        return 0;
3525
    }
3526
}
3527

    
3528
/*
3529
 * Takes a bunch of requests and tries to merge them. Returns the number of
3530
 * requests that remain after merging.
3531
 */
3532
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3533
    int num_reqs, MultiwriteCB *mcb)
3534
{
3535
    int i, outidx;
3536

    
3537
    // Sort requests by start sector
3538
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3539

    
3540
    // Check if adjacent requests touch the same clusters. If so, combine them,
3541
    // filling up gaps with zero sectors.
3542
    outidx = 0;
3543
    for (i = 1; i < num_reqs; i++) {
3544
        int merge = 0;
3545
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3546

    
3547
        // Handle exactly sequential writes and overlapping writes.
3548
        if (reqs[i].sector <= oldreq_last) {
3549
            merge = 1;
3550
        }
3551

    
3552
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3553
            merge = 0;
3554
        }
3555

    
3556
        if (merge) {
3557
            size_t size;
3558
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3559
            qemu_iovec_init(qiov,
3560
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3561

    
3562
            // Add the first request to the merged one. If the requests are
3563
            // overlapping, drop the last sectors of the first request.
3564
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
3565
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3566

    
3567
            // We should need to add any zeros between the two requests
3568
            assert (reqs[i].sector <= oldreq_last);
3569

    
3570
            // Add the second request
3571
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3572

    
3573
            reqs[outidx].nb_sectors = qiov->size >> 9;
3574
            reqs[outidx].qiov = qiov;
3575

    
3576
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3577
        } else {
3578
            outidx++;
3579
            reqs[outidx].sector     = reqs[i].sector;
3580
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3581
            reqs[outidx].qiov       = reqs[i].qiov;
3582
        }
3583
    }
3584

    
3585
    return outidx + 1;
3586
}
3587

    
3588
/*
3589
 * Submit multiple AIO write requests at once.
3590
 *
3591
 * On success, the function returns 0 and all requests in the reqs array have
3592
 * been submitted. In error case this function returns -1, and any of the
3593
 * requests may or may not be submitted yet. In particular, this means that the
3594
 * callback will be called for some of the requests, for others it won't. The
3595
 * caller must check the error field of the BlockRequest to wait for the right
3596
 * callbacks (if error != 0, no callback will be called).
3597
 *
3598
 * The implementation may modify the contents of the reqs array, e.g. to merge
3599
 * requests. However, the fields opaque and error are left unmodified as they
3600
 * are used to signal failure for a single request to the caller.
3601
 */
3602
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3603
{
3604
    MultiwriteCB *mcb;
3605
    int i;
3606

    
3607
    /* don't submit writes if we don't have a medium */
3608
    if (bs->drv == NULL) {
3609
        for (i = 0; i < num_reqs; i++) {
3610
            reqs[i].error = -ENOMEDIUM;
3611
        }
3612
        return -1;
3613
    }
3614

    
3615
    if (num_reqs == 0) {
3616
        return 0;
3617
    }
3618

    
3619
    // Create MultiwriteCB structure
3620
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3621
    mcb->num_requests = 0;
3622
    mcb->num_callbacks = num_reqs;
3623

    
3624
    for (i = 0; i < num_reqs; i++) {
3625
        mcb->callbacks[i].cb = reqs[i].cb;
3626
        mcb->callbacks[i].opaque = reqs[i].opaque;
3627
    }
3628

    
3629
    // Check for mergable requests
3630
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3631

    
3632
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3633

    
3634
    /* Run the aio requests. */
3635
    mcb->num_requests = num_reqs;
3636
    for (i = 0; i < num_reqs; i++) {
3637
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3638
            reqs[i].nb_sectors, multiwrite_cb, mcb);
3639
    }
3640

    
3641
    return 0;
3642
}
3643

    
3644
/* Cancel @acb through the cancel hook of its AIOCBInfo. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}
3648

    
3649
/* block I/O throttling */
3650
/*
 * Check whether a request of @nb_sectors sectors would exceed the
 * bytes-per-second limit for the current slice.
 *
 * The total limit is used when set, otherwise the limit for the request's
 * direction.  Returns false with *wait = 0 when no limit applies or the
 * request still fits.  Otherwise returns true, stores the computed delay
 * in *wait, and extends the slice (bs->slice_time, bs->slice_end).
 * @wait may be NULL.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    bool     use_total = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] != 0;
    uint64_t limit;
    double   allowed, done, pending;
    double   window, delay;

    if (use_total) {
        limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else {
        limit = bs->io_limits.bps[is_write];
    }

    if (!limit) {
        /* No applicable limit is configured. */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Length of the current slice in seconds, and the byte budget for it. */
    window = bs->slice_end - bs->slice_start;
    window /= (NANOSECONDS_PER_SECOND);
    allowed = limit * window;

    /* done: bytes accounted since the slice's baseline was taken; with a
     * total limit both directions count.
     * pending: bytes this request would add. */
    done = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (use_total) {
        done += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }
    pending = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (done + pending <= allowed) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Approximate time until the request can be dispatched:
     * (done + pending) / limit is the time needed for all the data at the
     * limit rate; the time already elapsed in the slice is subtracted. */
    delay = (done + pending) / limit - elapsed_time;

    /* Extend the slice so the current accounting is kept until the delayed
     * request fires; the factors are empirically tuned. */
    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3709

    
3710
/*
 * Check whether one more request would exceed the operations-per-second
 * limit for the current slice.
 *
 * The total limit is used when set, otherwise the limit for the request's
 * direction.  Returns false with *wait = 0 when no limit applies or the
 * request still fits.  Otherwise returns true, stores the computed delay
 * in *wait, and extends the slice (bs->slice_time, bs->slice_end).
 * @wait may be NULL.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    bool     use_total = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] != 0;
    uint64_t limit;
    double   allowed, done;
    double   window, delay;

    if (use_total) {
        limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else {
        limit = bs->io_limits.iops[is_write];
    }

    if (!limit) {
        /* No applicable limit is configured. */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Length of the current slice in seconds, and the op budget for it. */
    window = bs->slice_end - bs->slice_start;
    window /= (NANOSECONDS_PER_SECOND);
    allowed = limit * window;

    /* Operations accounted since the slice's baseline was taken; with a
     * total limit both directions count. */
    done = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (use_total) {
        done += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (done + 1 <= allowed) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Approximate time until the request can be dispatched, never
     * negative. */
    delay = (done + 1) / limit;
    if (delay > elapsed_time) {
        delay = delay - elapsed_time;
    } else {
        delay = 0;
    }

    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3761

    
3762
/*
 * Decide whether a request of @nb_sectors sectors must be throttled.
 *
 * If the current time lies strictly inside the active slice, the slice end
 * is moved to now + slice_time; otherwise a new slice is started and the
 * byte/op baselines are reset to the current counters.  The bps and iops
 * checks are then both run.  If either reports an excess, the larger of
 * the two delays is stored in *wait, the slice is extended to cover it,
 * and true is returned.  Otherwise *wait is set to 0 and false is
 * returned.  @wait may be NULL.
 */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, longest;
    uint64_t bps_delay = 0, iops_delay = 0;
    double   elapsed;
    int      over_bps, over_iops;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Start a fresh slice and take new accounting baselines. */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    /* Seconds elapsed since the slice began. */
    elapsed  = now - bs->slice_start;
    elapsed  /= (NANOSECONDS_PER_SECOND);

    /* Both checks always run; each may extend the slice on its own. */
    over_bps  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                       is_write, elapsed, &bps_delay);
    over_iops = bdrv_exceed_iops_limits(bs, is_write,
                                        elapsed, &iops_delay);

    if (!over_bps && !over_iops) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    longest = bps_delay > iops_delay ? bps_delay : iops_delay;
    if (wait) {
        *wait = longest;
    }

    /* Make sure the slice lasts at least until the delay has passed. */
    now = qemu_get_clock_ns(vm_clock);
    if (bs->slice_end < now + longest) {
        bs->slice_end = now + longest;
    }

    return true;
}
3813

    
3814
/**************************************************************/
3815
/* async block device emulation */
3816

    
3817
typedef struct BlockDriverAIOCBSync {
3818
    BlockDriverAIOCB common;
3819
    QEMUBH *bh;
3820
    int ret;
3821
    /* vector translation state */
3822
    QEMUIOVector *qiov;
3823
    uint8_t *bounce;
3824
    int is_write;
3825
} BlockDriverAIOCBSync;
3826

    
3827
/*
 * Cancel callback for requests created by bdrv_aio_rw_vector().
 *
 * The pending bottom half is deleted, so bdrv_aio_bh_cb() never runs for
 * this request: the completion callback is not invoked and the bounce
 * buffer, which bdrv_aio_bh_cb() would otherwise free, must be released
 * here to avoid leaking it.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;

    /* Nobody else will free the bounce buffer once the BH is gone */
    qemu_vfree(acb->bounce);
    acb->bounce = NULL;

    qemu_aio_release(acb);
}
3835

    
3836
static const AIOCBInfo bdrv_em_aiocb_info = {
3837
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3838
    .cancel             = bdrv_aio_cancel_em,
3839
};
3840

    
3841
static void bdrv_aio_bh_cb(void *opaque)
3842
{
3843
    BlockDriverAIOCBSync *acb = opaque;
3844

    
3845
    if (!acb->is_write)
3846
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3847
    qemu_vfree(acb->bounce);
3848
    acb->common.cb(acb->common.opaque, acb->ret);
3849
    qemu_bh_delete(acb->bh);
3850
    acb->bh = NULL;
3851
    qemu_aio_release(acb);
3852
}
3853

    
3854
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3855
                                            int64_t sector_num,
3856
                                            QEMUIOVector *qiov,
3857
                                            int nb_sectors,
3858
                                            BlockDriverCompletionFunc *cb,
3859
                                            void *opaque,
3860
                                            int is_write)
3861

    
3862
{
3863
    BlockDriverAIOCBSync *acb;
3864

    
3865
    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
3866
    acb->is_write = is_write;
3867
    acb->qiov = qiov;
3868
    acb->bounce = qemu_blockalign(bs, qiov->size);
3869
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3870

    
3871
    if (is_write) {
3872
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3873
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3874
    } else {
3875
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3876
    }
3877

    
3878
    qemu_bh_schedule(acb->bh);
3879

    
3880
    return &acb->common;
3881
}
3882

    
3883
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3884
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3885
        BlockDriverCompletionFunc *cb, void *opaque)
3886
{
3887
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3888
}
3889

    
3890
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3891
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3892
        BlockDriverCompletionFunc *cb, void *opaque)
3893
{
3894
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3895
}
3896

    
3897

    
3898
typedef struct BlockDriverAIOCBCoroutine {
3899
    BlockDriverAIOCB common;
3900
    BlockRequest req;
3901
    bool is_write;
3902
    bool *done;
3903
    QEMUBH* bh;
3904
} BlockDriverAIOCBCoroutine;
3905

    
3906
/*
 * Cancel callback for coroutine-based requests.
 *
 * The request is not aborted; instead a completion flag is installed in
 * the AIOCB and qemu_aio_wait() is called until bdrv_co_em_bh() sets it.
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *req =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool finished = false;

    req->done = &finished;
    while (!finished) {
        qemu_aio_wait();
    }
}
3917

    
3918
static const AIOCBInfo bdrv_em_co_aiocb_info = {
3919
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3920
    .cancel             = bdrv_aio_co_cancel_em,
3921
};
3922

    
3923
static void bdrv_co_em_bh(void *opaque)
3924
{
3925
    BlockDriverAIOCBCoroutine *acb = opaque;
3926

    
3927
    acb->common.cb(acb->common.opaque, acb->req.error);
3928

    
3929
    if (acb->done) {
3930
        *acb->done = true;
3931
    }
3932

    
3933
    qemu_bh_delete(acb->bh);
3934
    qemu_aio_release(acb);
3935
}
3936

    
3937
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3938
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3939
{
3940
    BlockDriverAIOCBCoroutine *acb = opaque;
3941
    BlockDriverState *bs = acb->common.bs;
3942

    
3943
    if (!acb->is_write) {
3944
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3945
            acb->req.nb_sectors, acb->req.qiov, 0);
3946
    } else {
3947
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3948
            acb->req.nb_sectors, acb->req.qiov, 0);
3949
    }
3950

    
3951
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3952
    qemu_bh_schedule(acb->bh);
3953
}
3954

    
3955
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3956
                                               int64_t sector_num,
3957
                                               QEMUIOVector *qiov,
3958
                                               int nb_sectors,
3959
                                               BlockDriverCompletionFunc *cb,
3960
                                               void *opaque,
3961
                                               bool is_write)
3962
{
3963
    Coroutine *co;
3964
    BlockDriverAIOCBCoroutine *acb;
3965

    
3966
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3967
    acb->req.sector = sector_num;
3968
    acb->req.nb_sectors = nb_sectors;
3969
    acb->req.qiov = qiov;
3970
    acb->is_write = is_write;
3971
    acb->done = NULL;
3972

    
3973
    co = qemu_coroutine_create(bdrv_co_do_rw);
3974
    qemu_coroutine_enter(co, acb);
3975

    
3976
    return &acb->common;
3977
}
3978

    
3979
/*
 * Coroutine entry point for bdrv_aio_flush(): flush the device, store the
 * result in req.error and schedule bdrv_co_em_bh() to report it.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *request = opaque;

    request->req.error = bdrv_co_flush(request->common.bs);

    request->bh = qemu_bh_new(bdrv_co_em_bh, request);
    qemu_bh_schedule(request->bh);
}
3988

    
3989
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3990
        BlockDriverCompletionFunc *cb, void *opaque)
3991
{
3992
    trace_bdrv_aio_flush(bs, opaque);
3993

    
3994
    Coroutine *co;
3995
    BlockDriverAIOCBCoroutine *acb;
3996

    
3997
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
3998
    acb->done = NULL;
3999

    
4000
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4001
    qemu_coroutine_enter(co, acb);
4002

    
4003
    return &acb->common;
4004
}
4005

    
4006
/*
 * Coroutine entry point for bdrv_aio_discard(): discard the recorded sector
 * range, store the result in req.error and schedule bdrv_co_em_bh() to
 * report it.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *request = opaque;

    request->req.error = bdrv_co_discard(request->common.bs,
                                         request->req.sector,
                                         request->req.nb_sectors);

    request->bh = qemu_bh_new(bdrv_co_em_bh, request);
    qemu_bh_schedule(request->bh);
}
4015

    
4016
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4017
        int64_t sector_num, int nb_sectors,
4018
        BlockDriverCompletionFunc *cb, void *opaque)
4019
{
4020
    Coroutine *co;
4021
    BlockDriverAIOCBCoroutine *acb;
4022

    
4023
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4024

    
4025
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4026
    acb->req.sector = sector_num;
4027
    acb->req.nb_sectors = nb_sectors;
4028
    acb->done = NULL;
4029
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4030
    qemu_coroutine_enter(co, acb);
4031

    
4032
    return &acb->common;
4033
}
4034

    
4035
void bdrv_init(void)
4036
{
4037
    module_call_init(MODULE_INIT_BLOCK);
4038
}
4039

    
4040
void bdrv_init_with_whitelist(void)
4041
{
4042
    use_bdrv_whitelist = 1;
4043
    bdrv_init();
4044
}
4045

    
4046
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4047
                   BlockDriverCompletionFunc *cb, void *opaque)
4048
{
4049
    BlockDriverAIOCB *acb;
4050

    
4051
    acb = g_slice_alloc(aiocb_info->aiocb_size);
4052
    acb->aiocb_info = aiocb_info;
4053
    acb->bs = bs;
4054
    acb->cb = cb;
4055
    acb->opaque = opaque;
4056
    return acb;
4057
}
4058

    
4059
void qemu_aio_release(void *p)
4060
{
4061
    BlockDriverAIOCB *acb = p;
4062
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4063
}
4064

    
4065
/**************************************************************/
4066
/* Coroutine block device emulation */
4067

    
4068
typedef struct CoroutineIOCompletion {
4069
    Coroutine *coroutine;
4070
    int ret;
4071
} CoroutineIOCompletion;
4072

    
4073
static void bdrv_co_io_em_complete(void *opaque, int ret)
4074
{
4075
    CoroutineIOCompletion *co = opaque;
4076

    
4077
    co->ret = ret;
4078
    qemu_coroutine_enter(co->coroutine, NULL);
4079
}
4080

    
4081
/*
 * Emulate a coroutine read/write with the driver's bdrv_aio_readv or
 * bdrv_aio_writev callback.
 *
 * The request is submitted with bdrv_co_io_em_complete() as completion
 * callback, then the coroutine yields until that callback re-enters it.
 *
 * Returns the result delivered to the callback, or -EIO if the driver
 * returned no AIOCB.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &completion);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &completion);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    qemu_coroutine_yield();

    return completion.ret;
}
4106

    
4107
/* Read variant of bdrv_co_io_em() */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4113

    
4114
/* Write variant of bdrv_co_io_em() */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4120

    
4121
/* Coroutine entry point for bdrv_flush(): stores the result in rwco->ret */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *args = opaque;

    args->ret = bdrv_co_flush(args->bs);
}
4127

    
4128
/*
 * Flush @bs and then, recursively, bs->file.
 *
 * Returns 0 immediately when @bs is NULL, has no medium inserted or is
 * read-only.  Otherwise returns the first negative error from the driver
 * callbacks, or the result of flushing bs->file.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            CoroutineIOCompletion completion = {
                .coroutine = qemu_coroutine_self(),
            };
            BlockDriverAIOCB *acb;

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
                                          &completion);
            if (acb != NULL) {
                /* bdrv_co_io_em_complete() re-enters us with the result */
                qemu_coroutine_yield();
                ret = completion.ret;
            } else {
                ret = -EIO;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and therefore don't support bdrv_flush. Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk. Returning an
             * error doesn't work because that would break guests even if
             * the server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }

        if (ret < 0) {
            return ret;
        }
    }

    /*
     * Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
4188

    
4189
/*
 * Call the driver's bdrv_invalidate_cache callback for @bs, if the device
 * has a driver and the driver implements it.
 */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (!bs->drv || !bs->drv->bdrv_invalidate_cache) {
        return;
    }

    bs->drv->bdrv_invalidate_cache(bs);
}
4195

    
4196
void bdrv_invalidate_cache_all(void)
4197
{
4198
    BlockDriverState *bs;
4199

    
4200
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4201
        bdrv_invalidate_cache(bs);
4202
    }
4203
}
4204

    
4205
void bdrv_clear_incoming_migration_all(void)
4206
{
4207
    BlockDriverState *bs;
4208

    
4209
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
4210
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4211
    }
4212
}
4213

    
4214
/*
 * Synchronous flush of @bs.
 *
 * Inside a coroutine the flush entry point is called directly; otherwise a
 * coroutine is created for it and qemu_aio_wait() is called until the
 * result is available.  Returns the value produced by bdrv_co_flush().
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };
    Coroutine *co;

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
        return rwco.ret;
    }

    co = qemu_coroutine_create(bdrv_flush_co_entry);
    qemu_coroutine_enter(co, &rwco);
    while (rwco.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return rwco.ret;
}
4235

    
4236
/* Coroutine entry point for bdrv_discard(): stores the result in rwco->ret */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *args = opaque;

    args->ret = bdrv_co_discard(args->bs, args->sector_num, args->nb_sectors);
}
4242

    
4243
/*
 * Discard @nb_sectors sectors starting at @sector_num.
 *
 * Returns -ENOMEDIUM without a driver, -EIO when bdrv_check_request()
 * rejects the range, and -EROFS for read-only devices.  The range is
 * removed from the dirty bitmap (if any) before anything else happens.
 * If BDRV_O_UNMAP is not set, or the driver implements neither
 * bdrv_co_discard nor bdrv_aio_discard, 0 is returned without calling the
 * driver; otherwise the driver's result is returned (-EIO if the AIO
 * callback returned no AIOCB).
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
    if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    }

    if (bs->drv->bdrv_aio_discard) {
        CoroutineIOCompletion completion = {
            .coroutine = qemu_coroutine_self(),
        };
        BlockDriverAIOCB *acb;

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &completion);
        if (acb == NULL) {
            return -EIO;
        }

        /* bdrv_co_io_em_complete() re-enters us with the result */
        qemu_coroutine_yield();
        return completion.ret;
    }

    return 0;
}
4283

    
4284
/*
 * Synchronous discard of @nb_sectors sectors starting at @sector_num.
 *
 * Inside a coroutine the discard entry point is called directly; otherwise
 * a coroutine is created for it and qemu_aio_wait() is called until the
 * result is available.  Returns the value produced by bdrv_co_discard().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };
    Coroutine *co;

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
        return rwco.ret;
    }

    co = qemu_coroutine_create(bdrv_discard_co_entry);
    qemu_coroutine_enter(co, &rwco);
    while (rwco.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return rwco.ret;
}
4307

    
4308
/**************************************************************/
4309
/* removable device support */
4310

    
4311
/**
4312
 * Return TRUE if the media is present
4313
 */
4314
int bdrv_is_inserted(BlockDriverState *bs)
4315
{
4316
    BlockDriver *drv = bs->drv;
4317

    
4318
    if (!drv)
4319
        return 0;
4320
    if (!drv->bdrv_is_inserted)
4321
        return 1;
4322
    return drv->bdrv_is_inserted(bs);
4323
}
4324

    
4325
/**
4326
 * Return whether the media changed since the last call to this
4327
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4328
 */
4329
int bdrv_media_changed(BlockDriverState *bs)
4330
{
4331
    BlockDriver *drv = bs->drv;
4332

    
4333
    if (drv && drv->bdrv_media_changed) {
4334
        return drv->bdrv_media_changed(bs);
4335
    }
4336
    return -ENOTSUP;
4337
}
4338

    
4339
/**
4340
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4341
 */
4342
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4343
{
4344
    BlockDriver *drv = bs->drv;
4345

    
4346
    if (drv && drv->bdrv_eject) {
4347
        drv->bdrv_eject(bs, eject_flag);
4348
    }
4349

    
4350
    if (bs->device_name[0] != '\0') {
4351
        bdrv_emit_qmp_eject_event(bs, eject_flag);
4352
    }
4353
}
4354

    
4355
/**
4356
 * Lock or unlock the media (if it is locked, the user won't be able
4357
 * to eject it manually).
4358
 */
4359
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4360
{
4361
    BlockDriver *drv = bs->drv;
4362

    
4363
    trace_bdrv_lock_medium(bs, locked);
4364

    
4365
    if (drv && drv->bdrv_lock_medium) {
4366
        drv->bdrv_lock_medium(bs, locked);
4367
    }
4368
}
4369

    
4370
/* needed for generic scsi interface */
4371

    
4372
/*
 * Forward an ioctl to the driver's bdrv_ioctl callback.
 * Returns -ENOTSUP when there is no driver or no such callback.
 */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }

    return drv->bdrv_ioctl(bs, req, buf);
}
4380

    
4381
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4382
        unsigned long int req, void *buf,
4383
        BlockDriverCompletionFunc *cb, void *opaque)
4384
{
4385
    BlockDriver *drv = bs->drv;
4386

    
4387
    if (drv && drv->bdrv_aio_ioctl)
4388
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4389
    return NULL;
4390
}
4391

    
4392
/* Record @align as the buffer alignment of @bs (see qemu_blockalign()) */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
4396

    
4397
/*
 * Allocate @size bytes with qemu_memalign(), aligned to the buffer
 * alignment of @bs.  An alignment of 512 is used when @bs is NULL or its
 * buffer_alignment is zero.
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    size_t alignment = 512;

    if (bs && bs->buffer_alignment) {
        alignment = bs->buffer_alignment;
    }

    return qemu_memalign(alignment, size);
}
4401

    
4402
/*
4403
 * Check if all memory in this vector is sector aligned.
4404
 */
4405
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4406
{
4407
    int i;
4408

    
4409
    for (i = 0; i < qiov->niov; i++) {
4410
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4411
            return false;
4412
        }
4413
    }
4414

    
4415
    return true;
4416
}
4417

    
4418
/*
 * Enable or disable dirty tracking on @bs.
 *
 * @granularity is in bytes and must be zero or a power of two (asserted).
 * Zero frees an existing dirty bitmap.  A non-zero value allocates a new
 * bitmap covering the device length in sectors; no bitmap may exist yet
 * in that case (asserted).
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t nb_sectors;

    assert((granularity & (granularity - 1)) == 0);

    if (!granularity) {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
        return;
    }

    /* Convert the granularity from bytes to sectors */
    granularity >>= BDRV_SECTOR_BITS;
    assert(!bs->dirty_bitmap);
    nb_sectors = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bs->dirty_bitmap = hbitmap_alloc(nb_sectors, ffs(granularity) - 1);
}
4436

    
4437
/*
 * Return the dirty bitmap's value for @sector, or 0 when @bs has no dirty
 * bitmap.
 */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_get(bs->dirty_bitmap, sector);
}
4445

    
4446
/* Initialize @hbi to iterate over the dirty bitmap of @bs from position 0 */
void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    const uint64_t first = 0;

    hbitmap_iter_init(hbi, bs->dirty_bitmap, first);
}
4450

    
4451
/*
 * Mark @nr_sectors sectors starting at @cur_sector in the dirty bitmap.
 * The bitmap pointer is passed to hbitmap_set() without a NULL check.
 */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    HBitmap *bitmap = bs->dirty_bitmap;

    hbitmap_set(bitmap, cur_sector, nr_sectors);
}
4456

    
4457
/*
 * Clear @nr_sectors sectors starting at @cur_sector in the dirty bitmap.
 * The bitmap pointer is passed to hbitmap_reset() without a NULL check.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    HBitmap *bitmap = bs->dirty_bitmap;

    hbitmap_reset(bitmap, cur_sector, nr_sectors);
}
4462

    
4463
/*
 * Return hbitmap_count() of the dirty bitmap, or 0 when @bs has no dirty
 * bitmap.
 */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (!bs->dirty_bitmap) {
        return 0;
    }

    return hbitmap_count(bs->dirty_bitmap);
}
4471

    
4472
/*
 * Set the in_use flag of @bs.  The new value must differ from the current
 * one (asserted).
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);

    bs->in_use = in_use;
}
4477

    
4478
/* Return the in_use flag of @bs */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
4482

    
4483
/* Turn on I/O status tracking for @bs and reset the status to OK */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
4488

    
4489
/* The I/O status is only enabled if the drive explicitly
4490
 * enables it _and_ the VM is configured to stop on errors */
4491
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4492
{
4493
    return (bs->iostatus_enabled &&
4494
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4495
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4496
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4497
}
4498

    
4499
/* Turn off I/O status tracking for @bs; the stored status is left as is */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
4503

    
4504
/*
 * Reset the I/O status of @bs to OK, and that of its block job if it has
 * one.  Does nothing unless bdrv_iostatus_is_enabled() holds.
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    if (bs->job) {
        block_job_iostatus_reset(bs->job);
    }
}
4513

    
4514
/*
 * Record an I/O error in the status of @bs.
 *
 * Requires bdrv_iostatus_is_enabled() (asserted).  Only the first error is
 * kept: the status changes only while it is still OK.  ENOSPC maps to
 * NOSPACE, every other value of @error to FAILED.
 */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));

    if (bs->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
        return;
    }

    if (error == ENOSPC) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
    } else {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
4522

    
4523
void
4524
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4525
        enum BlockAcctType type)
4526
{
4527
    assert(type < BDRV_MAX_IOTYPE);
4528

    
4529
    cookie->bytes = bytes;
4530
    cookie->start_time_ns = get_clock();
4531
    cookie->type = type;
4532
}
4533

    
4534
void
4535
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4536
{
4537
    assert(cookie->type < BDRV_MAX_IOTYPE);
4538

    
4539
    bs->nr_bytes[cookie->type] += cookie->bytes;
4540
    bs->nr_ops[cookie->type]++;
4541
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4542
}
4543

    
4544
void bdrv_img_create(const char *filename, const char *fmt,
4545
                     const char *base_filename, const char *base_fmt,
4546
                     char *options, uint64_t img_size, int flags,
4547
                     Error **errp, bool quiet)
4548
{
4549
    QEMUOptionParameter *param = NULL, *create_options = NULL;
4550
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
4551
    BlockDriverState *bs = NULL;
4552
    BlockDriver *drv, *proto_drv;
4553
    BlockDriver *backing_drv = NULL;
4554
    int ret = 0;
4555

    
4556
    /* Find driver and parse its options */
4557
    drv = bdrv_find_format(fmt);
4558
    if (!drv) {
4559
        error_setg(errp, "Unknown file format '%s'", fmt);
4560
        return;
4561
    }
4562

    
4563
    proto_drv = bdrv_find_protocol(filename);
4564
    if (!proto_drv) {
4565
        error_setg(errp, "Unknown protocol '%s'", filename);
4566
        return;
4567
    }
4568

    
4569
    create_options = append_option_parameters(create_options,
4570
                                              drv->create_options);
4571
    create_options = append_option_parameters(create_options,
4572
                                              proto_drv->create_options);
4573

    
4574
    /* Create parameter list with default values */
4575
    param = parse_option_parameters("", create_options, param);
4576

    
4577
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4578

    
4579
    /* Parse -o options */
4580
    if (options) {
4581
        param = parse_option_parameters(options, create_options, param);
4582
        if (param == NULL) {
4583
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
4584
            goto out;
4585
        }
4586
    }
4587

    
4588
    if (base_filename) {
4589
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4590
                                 base_filename)) {
4591
            error_setg(errp, "Backing file not supported for file format '%s'",
4592
                       fmt);
4593
            goto out;
4594
        }
4595
    }
4596

    
4597
    if (base_fmt) {
4598
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4599
            error_setg(errp, "Backing file format not supported for file "
4600
                             "format '%s'", fmt);
4601
            goto out;
4602
        }
4603
    }
4604

    
4605
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4606
    if (backing_file && backing_file->value.s) {
4607
        if (!strcmp(filename, backing_file->value.s)) {
4608
            error_setg(errp, "Error: Trying to create an image with the "
4609
                             "same filename as the backing file");
4610
            goto out;
4611
        }
4612
    }
4613

    
4614
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4615
    if (backing_fmt && backing_fmt->value.s) {
4616
        backing_drv = bdrv_find_format(backing_fmt->value.s);
4617
        if (!backing_drv) {
4618
            error_setg(errp, "Unknown backing file format '%s'",
4619
                       backing_fmt->value.s);
4620
            goto out;
4621
        }
4622
    }
4623

    
4624
    // The size for the image must always be specified, with one exception:
4625
    // If we are using a backing file, we can obtain the size from there
4626
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
4627
    if (size && size->value.n == -1) {
4628
        if (backing_file && backing_file->value.s) {
4629
            uint64_t size;
4630
            char buf[32];
4631
            int back_flags;
4632

    
4633
            /* backing files always opened read-only */
4634
            back_flags =
4635
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4636

    
4637
            bs = bdrv_new("");
4638

    
4639
            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
4640
                            backing_drv);
4641
            if (ret < 0) {
4642
                error_setg_errno(errp, -ret, "Could not open '%s'",
4643
                                 backing_file->value.s);
4644
                goto out;
4645
            }
4646
            bdrv_get_geometry(bs, &size);
4647
            size *= 512;
4648

    
4649
            snprintf(buf, sizeof(buf), "%" PRId64, size);
4650
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4651
        } else {
4652
            error_setg(errp, "Image creation needs a size parameter");
4653
            goto out;
4654
        }
4655
    }
4656

    
4657
    if (!quiet) {
4658
        printf("Formatting '%s', fmt=%s ", filename, fmt);
4659
        print_option_parameters(param);
4660
        puts("");
4661
    }
4662
    ret = bdrv_create(drv, filename, param);
4663
    if (ret < 0) {
4664
        if (ret == -ENOTSUP) {
4665
            error_setg(errp,"Formatting or formatting option not supported for "
4666
                            "file format '%s'", fmt);
4667
        } else if (ret == -EFBIG) {
4668
            error_setg(errp, "The image size is too large for file format '%s'",
4669
                       fmt);
4670
        } else {
4671
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
4672
                       strerror(-ret));
4673
        }
4674
    }
4675

    
4676
out:
4677
    free_option_parameters(create_options);
4678
    free_option_parameters(param);
4679

    
4680
    if (bs) {
4681
        bdrv_delete(bs);
4682
    }
4683
}
4684

    
4685
/*
 * Return the AioContext used by @bs.
 *
 * Currently BlockDriverState always uses the main loop AioContext, so @bs
 * is not consulted.
 */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    AioContext *main_ctx = qemu_get_aio_context();

    return main_ctx;
}