/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are serviced in FIFO order. The next throttled request is not
     * dequeued until the current one has been allowed to proceed, so if the
     * current request still exceeds the limits it is re-inserted at the head
     * of the queue and every request behind it stays in throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
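
/*
 * Illustrative sketch (not part of the upstream flow; the limit value is
 * hypothetical): enabling I/O limits on a device so that the intercept
 * above paces its requests.
 *
 *     BlockIOLimit *limits = &bs->io_limits;
 *     limits->bps[BLOCK_IO_LIMIT_TOTAL] = 1024 * 1024;   // 1 MB/s, example
 *     bdrv_io_limits_enable(bs);       // init throttled_reqs + block_timer
 *     // Every subsequent read/write path calls bdrv_io_limits_intercept(),
 *     // which queues the calling coroutine until the limits allow it.
 */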

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
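
/*
 * Illustrative examples: path_has_protocol("nbd:localhost:1234") is non-zero,
 * while path_has_protocol("/var/lib/images/disk.img") is 0 because a '/'
 * appears before any ':'. On Windows, a bare drive such as "d:" is treated
 * as a path, not a protocol.
 */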

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
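
/*
 * Illustrative example (file names are hypothetical): combining a backing
 * file name with the path of the image that references it.
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/disk.qcow2", "base.qcow2");
 *     // dest == "/images/base.qcow2"
 *     path_combine(dest, sizeof(dest), "/images/disk.qcow2", "/abs/base.raw");
 *     // dest == "/abs/base.raw" (absolute names are copied verbatim)
 */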

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;

out:
    g_free(cco.filename);
    return ret;
}
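
/*
 * Illustrative usage sketch (error handling omitted; size and file name are
 * hypothetical): creating a 1 GiB qcow2 image through the driver's create
 * callback, mirroring what the snapshot=on path in bdrv_open() does below.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1073741824);
 *     int ret = bdrv_create(drv, "/tmp/test.qcow2", opts);
 *     free_option_parameters(opts);
 */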

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
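
/*
 * Illustrative usage sketch: obtaining a scratch file, as the snapshot=on
 * code in bdrv_open() does further down.
 *
 *     char tmp[PATH_MAX + 1];
 *     if (get_tmp_filename(tmp, sizeof(tmp)) < 0) {
 *         // no usable temporary file; propagate the -errno to the caller
 *     }
 */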

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
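
/*
 * Illustrative examples: "nbd:localhost:10809" resolves to the driver whose
 * protocol_name is "nbd"; "/dev/cdrom" is claimed first by a host device
 * driver via bdrv_probe_device(); "disk.img" has no protocol prefix and
 * falls back to the "file" driver.
 */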

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
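
/*
 * Summary of the cache modes handled above:
 *
 *     mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *     off/none            x                x
 *     directsync          x
 *     writeback                            x
 *     unsafe                               x                x
 *     writethrough   (none set; this is the default)
 */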

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    const char *filename, QDict *options,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    /* bdrv_open() was called with a protocol driver directly as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = bdrv_open_flags(bs, flags);

    bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(drv->bdrv_parse_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, filename, options, open_flags);
    } else {
        assert(file != NULL);
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags);
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(filename != NULL);
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname);
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename);
    } else {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        Error *local_err = NULL;
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            qerror_report_err(local_err);
            error_free(local_err);
            ret = -EINVAL;
            goto fail;
        }
    } else if (!drv->bdrv_parse_filename && !filename) {
        qerror_report(ERROR_CLASS_GENERIC_ERROR,
                      "The '%s' block driver requires a file name",
                      drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, filename, options, flags, drv);
    if (ret < 0) {
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
                      "support the option '%s'",
                      drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_delete(bs);
    return ret;
}
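
/*
 * Illustrative usage sketch (error handling abbreviated; keys, values and
 * the file name are hypothetical): opening a protocol-level file with an
 * explicit driver option.
 *
 *     QDict *opts = qdict_new();
 *     qdict_put(opts, "driver", qstring_from_str("file"));
 *     BlockDriverState *bs;
 *     int ret = bdrv_file_open(&bs, "/tmp/disk.img", opts, BDRV_O_RDWR);
 *     // The reference to opts belongs to the block layer after the call,
 *     // even if ret < 0; QINCREF() it first if it must be reused.
 */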

int bdrv_open_backing_file(BlockDriverState *bs)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;

    if (bs->backing_hd != NULL) {
        return 0;
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (bs->backing_file[0] == '\0') {
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd, backing_filename, NULL,
                    back_flags, back_drv);
    if (ret < 0) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        return ret;
    }
    return 0;
}

static void extract_subqdict(QDict *src, QDict **dst, const char *start)
{
    const QDictEntry *entry, *next;
    const char *p;

    *dst = qdict_new();
    entry = qdict_first(src);

    while (entry != NULL) {
        next = qdict_next(src, entry);
        if (strstart(entry->key, start, &p)) {
            qobject_incref(entry->value);
            qdict_put_obj(*dst, p, entry->value);
            qdict_del(src, entry->key);
        }
        entry = next;
    }
}
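
/*
 * Illustrative example: given src = { "file.driver": "file",
 * "file.filename": "disk.img", "lazy-refcounts": "on" },
 * extract_subqdict(src, &dst, "file.") moves the first two entries into
 * dst as { "driver": "file", "filename": "disk.img" } with the prefix
 * stripped, and leaves "lazy-refcounts" in src.
 */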

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        char backing_filename[PATH_MAX];

        if (qdict_size(options) != 0) {
            error_report("Can't use snapshot=on with driver-specific options");
            ret = -EINVAL;
            goto fail;
        }
        assert(filename != NULL);

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, NULL, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            goto fail;
        }

        /* Real path is meaningless for protocols */
        if (path_has_protocol(filename)) {
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        } else if (!realpath(filename, backing_filename)) {
            ret = -errno;
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
                             backing_filename);
        if (drv) {
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
        free_option_parameters(create_options);
        if (ret < 0) {
            goto fail;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags));
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(file, filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, filename, options, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_delete(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        ret = bdrv_open_backing_file(bs);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
            "device '%s' doesn't support the option '%s'",
            drv->format_name, bs->device_name, entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_delete(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. The newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be part of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs.
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
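
/*
 * Illustrative usage sketch (bs_a and bs_b are hypothetical): atomically
 * dropping BDRV_O_RDWR on two devices. If either prepare fails, neither
 * device is changed.
 *
 *     Error *local_err = NULL;
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
 *     if (bdrv_reopen_multiple(queue, &local_err) < 0) {
 *         // all prepared states were aborted; local_err describes the error
 *     }
 */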


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_set(errp, QERR_OPEN_FILE_FAILED,
                          reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NUL terminate the device_name to prevent double removal */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->slice_submitted    = bs_src->slice_submitted;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
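
/*
 * Illustrative note: this is how live snapshots layer a new image on top of
 * an active one. A freshly opened, anonymous overlay (bs_new) is placed on
 * top of the active image (bs_top); after bdrv_append() the guest device
 * keeps pointing at bs_top, whose contents are now the overlay's, while the
 * old top layer becomes its backing file.
 */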
1535

    
1536
void bdrv_delete(BlockDriverState *bs)
1537
{
1538
    assert(!bs->dev);
1539
    assert(!bs->job);
1540
    assert(!bs->in_use);
1541

    
1542
    /* remove from list, if necessary */
1543
    bdrv_make_anon(bs);
1544

    
1545
    bdrv_close(bs);
1546

    
1547
    assert(bs != bs_snapshots);
1548
    g_free(bs);
1549
}
1550

    
1551
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1552
/* TODO change to DeviceState *dev when all users are qdevified */
1553
{
1554
    if (bs->dev) {
1555
        return -EBUSY;
1556
    }
1557
    bs->dev = dev;
1558
    bdrv_iostatus_reset(bs);
1559
    return 0;
1560
}
1561

    
1562
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1563
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1564
{
1565
    if (bdrv_attach_dev(bs, dev) < 0) {
1566
        abort();
1567
    }
1568
}
1569

    
1570
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1571
/* TODO change to DeviceState *dev when all users are qdevified */
1572
{
1573
    assert(bs->dev == dev);
1574
    bs->dev = NULL;
1575
    bs->dev_ops = NULL;
1576
    bs->dev_opaque = NULL;
1577
    bs->buffer_alignment = 512;
1578
}
1579

    
1580
/* TODO change to return DeviceState * when all users are qdevified */
1581
void *bdrv_get_attached_dev(BlockDriverState *bs)
1582
{
1583
    return bs->dev;
1584
}
1585

    
1586
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1587
                      void *opaque)
1588
{
1589
    bs->dev_ops = ops;
1590
    bs->dev_opaque = opaque;
1591
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1592
        bs_snapshots = NULL;
1593
    }
1594
}
1595

    
1596
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1597
                               enum MonitorEvent ev,
1598
                               BlockErrorAction action, bool is_read)
1599
{
1600
    QObject *data;
1601
    const char *action_str;
1602

    
1603
    switch (action) {
1604
    case BDRV_ACTION_REPORT:
1605
        action_str = "report";
1606
        break;
1607
    case BDRV_ACTION_IGNORE:
1608
        action_str = "ignore";
1609
        break;
1610
    case BDRV_ACTION_STOP:
1611
        action_str = "stop";
1612
        break;
1613
    default:
1614
        abort();
1615
    }
1616

    
1617
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1618
                              bdrv->device_name,
1619
                              action_str,
1620
                              is_read ? "read" : "write");
1621
    monitor_protocol_event(ev, data);
1622

    
1623
    qobject_decref(data);
1624
}
1625

    
1626
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1627
{
1628
    QObject *data;
1629

    
1630
    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1631
                              bdrv_get_device_name(bs), ejected);
1632
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1633

    
1634
    qobject_decref(data);
1635
}
1636

    
1637
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1638
{
1639
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1640
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1641
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1642
        if (tray_was_closed) {
1643
            /* tray open */
1644
            bdrv_emit_qmp_eject_event(bs, true);
1645
        }
1646
        if (load) {
1647
            /* tray close */
1648
            bdrv_emit_qmp_eject_event(bs, false);
1649
        }
1650
    }
1651
}
1652

    
1653
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1654
{
1655
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1656
}
1657

    
1658
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1659
{
1660
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1661
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1662
    }
1663
}
1664

    
1665
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1666
{
1667
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1668
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
1669
    }
1670
    return false;
1671
}
1672

    
1673
static void bdrv_dev_resize_cb(BlockDriverState *bs)
1674
{
1675
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
1676
        bs->dev_ops->resize_cb(bs->dev_opaque);
1677
    }
1678
}
1679

    
1680
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1681
{
1682
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1683
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1684
    }
1685
    return false;
1686
}
1687

    
1688
/*
1689
 * Run consistency checks on an image
1690
 *
1691
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1692
 * free of errors) or -errno when an internal error occurred. The results of the
1693
 * check are stored in res.
1694
 */
1695
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1696
{
1697
    if (bs->drv->bdrv_check == NULL) {
1698
        return -ENOTSUP;
1699
    }
1700

    
1701
    memset(res, 0, sizeof(*res));
1702
    return bs->drv->bdrv_check(bs, res, fix);
1703
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
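
/*
 * Note how the loop above relies on bdrv_is_allocated() returning, in n, how
 * many of the queried sectors share the same allocation state: runs of
 * unallocated sectors are skipped in a single step instead of being copied
 * sector by sector.
 */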

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
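
/*
 * Usage sketch (this is exactly how bdrv_co_do_readv()/bdrv_co_do_writev()
 * below pair the two helpers):
 *
 *     BdrvTrackedRequest req;
 *     tracked_request_begin(&req, bs, sector_num, nb_sectors, is_write);
 *     ... issue the driver request ...
 *     tracked_request_end(&req);
 */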

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
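
/*
 * Worked example: assuming a 64 KiB cluster size, c is 128 sectors, so a
 * request with sector_num = 130 and nb_sectors = 10 is widened to
 * cluster_sector_num = 128 and cluster_nb_sectors = 128, covering the whole
 * cluster it touches.
 */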

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}
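
/*
 * Example: in the chain base <- sn1 <- active, bdrv_find_overlay(active, base)
 * returns sn1, while bdrv_find_overlay(active, active) returns NULL.
 */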

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;


    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_delete(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}


static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
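
/*
 * The shape above (create a coroutine, enter it, then spin in
 * qemu_aio_wait() until the entry function has stored its result) is the
 * standard way this file wraps a coroutine_fn into a synchronous call;
 * bdrv_is_allocated() below follows the same pattern.
 */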

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
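
/*
 * Alignment example: for offset = 1000 and count1 = 2000 the head read
 * copies the 24 bytes up to the next 512-byte boundary (1000 % 512 = 488),
 * the middle step reads three whole sectors (1536 bytes) in place, and the
 * tail read copies the remaining 440 bytes through the bounce buffer.
 */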

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
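
/*
 * Typical use (an illustration, not an exhaustive list): format drivers
 * update on-disk metadata through bdrv_pwrite_sync() so that, e.g., a table
 * entry is stable before the cluster it used to reference is reused.  With
 * writethrough cache modes the extra flush is skipped above because every
 * write already behaves that way.
 */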

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        bdrv_set_dirty(bs, sector_num, nb_sectors);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
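
/*
 * Example: with "-drive ...,werror=enospc" a failed write that returns
 * ENOSPC maps to BDRV_ACTION_STOP (the VM pauses), while any other errno
 * maps to BDRV_ACTION_REPORT and is simply reported to the guest.
 */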

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}
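
/*
 * Typical iteration over all block devices (the same pattern
 * bdrv_snapshots() uses below):
 *
 *     BlockDriverState *bs = NULL;
 *     while ((bs = bdrv_next(bs))) {
 *         ... visit bs ...
 *     }
 */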

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
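
/*
 * Sketch of walking an image's allocation map with the synchronous wrapper
 * (total_sectors and chunk_sectors are hypothetical caller-provided values;
 * compare the loop in bdrv_commit() above):
 *
 *     int64_t sector = 0;
 *     int n;
 *     while (sector < total_sectors) {
 *         if (bdrv_is_allocated(bs, sector, chunk_sectors, &n)) {
 *             ... sectors [sector, sector + n) are allocated ...
 *         }
 *         sector += n;
 *     }
 */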

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

/* Coroutine wrapper for bdrv_is_allocated_above() */
static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *top = data->bs;
    BlockDriverState *base = data->base;

    data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
                                           data->nb_sectors, data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated_above().
 *
 * See bdrv_co_is_allocated_above() for details.
 */
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
                            int64_t sector_num, int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = top,
        .base = base,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

BlockInfo *bdrv_query_info(BlockDriverState *bs)
{
    BlockInfo *info = g_malloc0(sizeof(*info));
    info->device = g_strdup(bs->device_name);
    info->type = g_strdup("unknown");
    info->locked = bdrv_dev_is_medium_locked(bs);
    info->removable = bdrv_dev_has_removable_media(bs);

    if (bdrv_dev_has_removable_media(bs)) {
        info->has_tray_open = true;
        info->tray_open = bdrv_dev_is_tray_open(bs);
    }

    if (bdrv_iostatus_is_enabled(bs)) {
        info->has_io_status = true;
        info->io_status = bs->iostatus;
    }

    if (bs->dirty_bitmap) {
        info->has_dirty = true;
        info->dirty = g_malloc0(sizeof(*info->dirty));
        info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
        info->dirty->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
    }

    if (bs->drv) {
        info->has_inserted = true;
        info->inserted = g_malloc0(sizeof(*info->inserted));
        info->inserted->file = g_strdup(bs->filename);
        info->inserted->ro = bs->read_only;
        info->inserted->drv = g_strdup(bs->drv->format_name);
        info->inserted->encrypted = bs->encrypted;
        info->inserted->encryption_key_missing = bdrv_key_required(bs);

        if (bs->backing_file[0]) {
            info->inserted->has_backing_file = true;
            info->inserted->backing_file = g_strdup(bs->backing_file);
        }

        info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);

        if (bs->io_limits_enabled) {
            info->inserted->bps =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
            info->inserted->bps_rd =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
            info->inserted->bps_wr =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
            info->inserted->iops =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
            info->inserted->iops_rd =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
            info->inserted->iops_wr =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
        }
    }
    return info;
}

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));
        info->value = bdrv_query_info(bs);

        *p_next = info;
        p_next = &info->next;
    }

    return head;
}

BlockStats *bdrv_query_stats(const BlockDriverState *bs)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = bdrv_query_stats(bs->file);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = bdrv_query_stats(bs);

        *p_next = info;
        p_next = &info->next;
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(!bs->dirty_bitmap);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, NULL, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}
3434

    
3435
/* backing_file can either be relative, or absolute, or a protocol.  If it is
3436
 * relative, it must be relative to the chain.  So, passing in bs->filename
3437
 * from a BDS as backing_file should not be done, as that may be relative to
3438
 * the CWD rather than the chain. */
3439
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3440
        const char *backing_file)
3441
{
3442
    char *filename_full = NULL;
3443
    char *backing_file_full = NULL;
3444
    char *filename_tmp = NULL;
3445
    int is_protocol = 0;
3446
    BlockDriverState *curr_bs = NULL;
3447
    BlockDriverState *retval = NULL;
3448

    
3449
    if (!bs || !bs->drv || !backing_file) {
3450
        return NULL;
3451
    }
3452

    
3453
    filename_full     = g_malloc(PATH_MAX);
3454
    backing_file_full = g_malloc(PATH_MAX);
3455
    filename_tmp      = g_malloc(PATH_MAX);
3456

    
3457
    is_protocol = path_has_protocol(backing_file);
3458

    
3459
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3460

    
3461
        /* If either of the filename paths is actually a protocol, then
3462
         * compare unmodified paths; otherwise make paths relative */
3463
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3464
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3465
                retval = curr_bs->backing_hd;
3466
                break;
3467
            }
3468
        } else {
3469
            /* If not an absolute filename path, make it relative to the current
3470
             * image's filename path */
3471
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3472
                         backing_file);
3473

    
3474
            /* We are going to compare absolute pathnames */
3475
            if (!realpath(filename_tmp, filename_full)) {
3476
                continue;
3477
            }
3478

    
3479
            /* We need to make sure the backing filename we are comparing against
3480
             * is relative to the current image filename (or absolute) */
3481
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3482
                         curr_bs->backing_file);
3483

    
3484
            if (!realpath(filename_tmp, backing_file_full)) {
3485
                continue;
3486
            }
3487

    
3488
            if (strcmp(backing_file_full, filename_full) == 0) {
3489
                retval = curr_bs->backing_hd;
3490
                break;
3491
            }
3492
        }
3493
    }
3494

    
3495
    g_free(filename_full);
3496
    g_free(backing_file_full);
3497
    g_free(filename_tmp);
3498
    return retval;
3499
}
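
/* Worked example (illustrative paths): for a chain opened as
 * /vm/overlay.qcow2 whose backing_file is "mid.qcow2", which in turn has
 * backing_file "base.raw", calling bdrv_find_backing_image(bs, "base.raw")
 * combines "base.raw" with /vm/mid.qcow2's directory, canonicalizes both
 * sides with realpath(), and returns the BDS of /vm/base.raw.  Protocol
 * backing files such as "nbd://host/export" are compared verbatim instead.
 */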

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
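
/* A few hand-computed examples of the formatting above (illustrative):
 *
 *     get_human_readable_size(buf, sizeof(buf), 999);       -> "999"
 *     get_human_readable_size(buf, sizeof(buf), 1536);      -> "1.5K"
 *     get_human_readable_size(buf, sizeof(buf), 10240);     -> "10K"
 *     get_human_readable_size(buf, sizeof(buf), 3LL << 30); -> "3.0G"
 *
 * Values of ten units and above are rounded to the nearest whole unit.
 */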

char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
    struct tm tm;
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}
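
/* Minimal caller sketch for the AIO entry points above (illustrative only;
 * example_read_done and example_submit_read are hypothetical):
 *
 *     static void example_read_done(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             fprintf(stderr, "read failed: %s\n", strerror(-ret));
 *         }
 *     }
 *
 *     static void example_submit_read(BlockDriverState *bs, void *buf)
 *     {
 *         static struct iovec iov;
 *         static QEMUIOVector qiov;
 *
 *         iov.iov_base = buf;
 *         iov.iov_len  = 8 * BDRV_SECTOR_SIZE;
 *         qemu_iovec_init_external(&qiov, &iov, 1);
 *         bdrv_aio_readv(bs, 0, &qiov, 8, example_read_done, NULL);
 *     }
 *
 * The qiov and iov must stay alive until the callback runs, hence the
 * static storage in this sketch.
 */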


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
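
/* Worked example for multiwrite_merge() (illustrative): after sorting,
 * suppose A = { .sector = 0, .nb_sectors = 8 } and B = { .sector = 8,
 * .nb_sectors = 8 }.  B starts exactly at A's end, so both collapse into one
 * 16-sector request whose qiov is A's buffers followed by B's.  If B instead
 * started at sector 4, only the first 4 << 9 bytes of A's qiov would be
 * kept, B's data would replace A's overlapping tail, and the merged request
 * would cover sectors 0..11.
 */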

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case, this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
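
/* Illustrative caller sketch for bdrv_aio_multiwrite() (hypothetical names;
 * virtio-blk is the real in-tree user of this interface):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov_a,
 *           .cb = example_write_done, .opaque = req_a },
 *         { .sector = 64, .nb_sectors = 8, .qiov = &qiov_b,
 *           .cb = example_write_done, .opaque = req_b },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // only entries with reqs[i].error == 0 still get their callback
 *     }
 */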

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    uint64_t extension;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->slice_submitted.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->slice_submitted.bytes[!is_write];
    }

    /* bytes_base: bytes already read/written in this slice, taken from the
     *             accumulated statistics.
     * bytes_res:  the remaining bytes that still need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: the total time needed to
     *             complete reading/writing all of the data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limit, bs->slice_end needs
     * to be extended so that the current statistics are kept until the
     * timer fires; the extension is rounded up to a whole number of slices
     * (a value tuned by experiment).
     */
    extension = wait_time * NANOSECONDS_PER_SECOND;
    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
                BLOCK_IO_SLICE_TIME;
    bs->slice_end += extension;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}
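
/* Worked example with illustrative numbers: with bps_limit = 1 MiB/s and a
 * 100 ms slice, bytes_limit is about 104858.  If 100000 bytes were already
 * submitted in this slice and a 64-sector (32768-byte) request arrives at
 * elapsed_time = 0.05 s, then bytes_base + bytes_res = 132768 > bytes_limit,
 * so the request is throttled and
 *
 *     wait_time = 132768 / 1048576 - 0.05 ~= 0.077 s
 *
 * i.e. the caller should requeue the request roughly 77 ms later.
 */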

static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->slice_submitted.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->slice_submitted.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch, in seconds */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Exceeded current slice, extend it by another slice time */
    bs->slice_end += BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * NANOSECONDS_PER_SECOND;
    }

    return true;
}

static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if (now > bs->slice_end) {
        bs->slice_start = now;
        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
                                           BDRV_SECTOR_SIZE;
    bs->slice_submitted.ios[is_write]++;

    return false;
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_flush(bs, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
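
/* Sketch of how an AIOCB type plugs into qemu_aio_get()/qemu_aio_release()
 * (illustrative; ExampleAIOCB and example_cancel are hypothetical):
 *
 *     typedef struct ExampleAIOCB {
 *         BlockDriverAIOCB common;   // must be first so container_of() works
 *         int my_state;
 *     } ExampleAIOCB;
 *
 *     static void example_cancel(BlockDriverAIOCB *blockacb)
 *     {
 *         ExampleAIOCB *acb = container_of(blockacb, ExampleAIOCB, common);
 *         qemu_aio_release(acb);
 *     }
 *
 *     static const AIOCBInfo example_aiocb_info = {
 *         .aiocb_size = sizeof(ExampleAIOCB),
 *         .cancel     = example_cancel,
 *     };
 *
 *     // allocate: acb = qemu_aio_get(&example_aiocb_info, bs, cb, opaque);
 */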

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
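
/* Summary of the fallback order above: a driver's bdrv_co_flush_to_os()
 * always runs first; then, unless BDRV_O_NO_FLUSH (cache=unsafe)
 * short-circuits it, data is forced out via bdrv_co_flush_to_disk(),
 * bdrv_aio_flush(), or a best-effort no-op; finally the protocol layer
 * underneath (bs->file) is flushed the same way.
 */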

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}
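
/* Illustrative pairing of the two helpers above: a buffer from
 * qemu_blockalign() always satisfies bdrv_qiov_is_aligned():
 *
 *     void *buf = qemu_blockalign(bs, 4096);
 *     struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *     QEMUIOVector qiov;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     ...
 *     qemu_vfree(buf);
 */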

void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        granularity >>= BDRV_SECTOR_BITS;
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
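
/* Example of the granularity arithmetic above (illustrative): for a 64 KiB
 * granularity, 65536 >> BDRV_SECTOR_BITS = 128 sectors per bit and
 * ffs(128) - 1 = 7, so hbitmap_alloc() is told to shift sector numbers
 * right by 7: one dirty bit covers 128 contiguous sectors.
 */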

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (bs->dirty_bitmap) {
        return hbitmap_get(bs->dirty_bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (bs->dirty_bitmap) {
        return hbitmap_count(bs->dirty_bitmap);
    } else {
        return 0;
    }
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
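
/* Typical accounting pattern around a request (illustrative; device models
 * such as IDE and virtio-blk wrap their reads and writes like this):
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     // ... submit the read and wait for completion ...
 *     bdrv_acct_done(bs, &cookie);
 */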

void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s'",
                                 backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_setg(errp, "Formatting or formatting option not supported for "
                             "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_setg(errp, "The image size is too large for file format '%s'",
                       fmt);
        } else {
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
                       strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }
}
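
/* Illustrative call, roughly what "qemu-img create -f qcow2 -b base.qcow2
 * overlay.qcow2" boils down to (flags and error handling elided):
 *
 *     Error *local_err = NULL;
 *
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.qcow2", NULL,
 *                     NULL, (uint64_t)-1, 0, &local_err, false);
 *
 * Passing a size of -1 lets the code above read the size from the backing
 * file instead.
 */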

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}