/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all the throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* This function should be called before bdrv_set_io_limits() if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
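
/*
 * Usage sketch (editor's addition, not upstream code): per the comment
 * above, throttling must be enabled before the limits are configured.
 * The ThrottleConfig field names below follow include/qemu/throttle.h
 * and are an assumption for illustration.
 *
 *     ThrottleConfig cfg = {
 *         .buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024,
 *     };
 *     bdrv_io_limits_enable(bs);      initializes throttle state and timers
 *     bdrv_set_io_limits(bs, &cfg);   then applies the configured limits
 */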

/* This function makes an I/O request wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   whether the I/O is a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* does this I/O need to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
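
/*
 * Worked examples (editor's addition) of the combination rules above:
 *
 *     path_combine(dest, sizeof(dest), "/img/base.qcow2", "backing.qcow2");
 *         => "/img/backing.qcow2"
 *     path_combine(dest, sizeof(dest), "/img/base.qcow2", "/abs/other.raw");
 *         => "/abs/other.raw"      (absolute names are copied unchanged)
 *     path_combine(dest, sizeof(dest), "nbd://host/a/b", "c");
 *         => "nbd://host/a/c"      (the protocol prefix is preserved)
 */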

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}
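
/*
 * Usage sketch (editor's addition): creating a 1 GiB image through the
 * protocol layer, using the same QEMUOptionParameter helpers that the
 * snapshot code in bdrv_open() below uses.
 *
 *     Error *local_err = NULL;
 *     BlockDriver *proto = bdrv_find_protocol("/tmp/test.img", true);
 *     QEMUOptionParameter *create_options =
 *         parse_option_parameters("", proto->create_options, NULL);
 *     set_option_parameter_int(create_options, BLOCK_OPT_SIZE, 1 << 30);
 *     bdrv_create_file("/tmp/test.img", create_options, &local_err);
 *     free_option_parameters(create_options);
 */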

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
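
/*
 * Usage sketch (editor's addition): the caller provides the buffer (at
 * least MAX_PATH bytes on Windows, per the assert above) and must unlink
 * the file when done, as the snapshot code in bdrv_open() does for its
 * temporary overlay.
 *
 *     char tmp_filename[PATH_MAX + 1];
 *     int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *     if (ret < 0) {
 *         error_setg_errno(errp, -ret, "Could not get temporary filename");
 *     }
 */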

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
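
/*
 * Examples (editor's addition) of the resolution order implemented above:
 *
 *     "/dev/cdrom"          -> a host device driver (bdrv_probe_device wins,
 *                              even over an apparent protocol prefix)
 *     "nbd:localhost:10809" -> the driver whose protocol_name is "nbd"
 *                              (only when allow_protocol_prefix is true)
 *     "disk.qcow2"          -> the "file" driver (no protocol prefix)
 */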

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
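
/*
 * Worked example (editor's addition):
 *
 *     int flags = BDRV_O_RDWR;
 *     bdrv_parse_discard_flags("unmap", &flags);    sets BDRV_O_UNMAP
 *     bdrv_parse_discard_flags("ignore", &flags);   clears it again
 *     bdrv_parse_discard_flags("bogus", &flags);    returns -1; note that
 *         BDRV_O_UNMAP is still cleared, since the mask is applied before
 *         the mode string is validated
 */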

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
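
/*
 * Worked mapping (editor's addition) of the cache modes handled above:
 *
 *     mode            NOCACHE   CACHE_WB   NO_FLUSH
 *     writethrough       -         -          -       (the default)
 *     writeback          -         x          -
 *     none / off         x         x          -
 *     directsync         x         -          -
 *     unsafe             -         x          x
 */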

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
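
/*
 * Sketch (editor's addition): because copy_on_read is a reference count,
 * independent users compose without clobbering each other.
 *
 *     bdrv_enable_copy_on_read(bs);    user A: copy_on_read == 1
 *     bdrv_enable_copy_on_read(bs);    user B: copy_on_read == 2
 *     bdrv_disable_copy_on_read(bs);   user A done: still enabled (1)
 *     bdrv_disable_copy_on_read(bs);   user B done: disabled (0)
 */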

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* If bdrv_open() was called directly with a protocol driver as drv, this
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags, Error **errp)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
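
/*
 * Usage sketch (editor's addition): note the reference-counting contract
 * described in the comment above -- the QDict passed in belongs to the
 * block layer even on failure, so QINCREF() it first if it will be reused.
 *
 *     BlockDriverState *bs;
 *     Error *local_err = NULL;
 *     QDict *opts = qdict_new();
 *     qdict_put(opts, "driver", qstring_from_str("file"));
 *     if (bdrv_file_open(&bs, "/tmp/test.img", opts, BDRV_O_RDWR,
 *                        &local_err) < 0) {
 *         opts is already consumed here; only local_err needs cleanup
 *     }
 */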

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }
    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
            bs->backing_hd->file->filename);
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        bs1 = bdrv_new("");
        QINCREF(options);
        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    qdict_extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        ret = find_image_format(file, filename, &drv, &local_err);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively it may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be part of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
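
/*
 * Usage sketch (editor's addition): reopening two devices read-only as a
 * single transaction.  If either prepare fails, both are rolled back, and
 * bdrv_reopen_multiple() frees the queue in all cases.
 *
 *     BlockReopenQueue *queue = NULL;
 *     Error *local_err = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
 *     if (bdrv_reopen_multiple(queue, &local_err) < 0) {
 *         handle or propagate local_err; queue is already freed
 *     }
 *
 * For a single device, bdrv_reopen() below wraps exactly this pattern.
 */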

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare() handler.
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state.
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (bdrv_start_throttled_reqs(bs)) {
                busy = true;
            }
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}
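
/*
 * Typical shutdown sequence (editor's addition), following the comment
 * above bdrv_drain_all():
 *
 *     bdrv_drain_all();    wait for in-flight and throttled requests
 *     bdrv_flush_all();    then flush emulated disk caches to the host
 */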
1573

    
1574
/* make a BlockDriverState anonymous by removing from bdrv_state list.
1575
   Also, NULL terminate the device_name to prevent double remove */
1576
void bdrv_make_anon(BlockDriverState *bs)
1577
{
1578
    if (bs->device_name[0] != '\0') {
1579
        QTAILQ_REMOVE(&bdrv_states, bs, list);
1580
    }
1581
    bs->device_name[0] = '\0';
1582
}
1583

    
1584
static void bdrv_rebind(BlockDriverState *bs)
1585
{
1586
    if (bs->drv && bs->drv->bdrv_rebind) {
1587
        bs->drv->bdrv_rebind(bs);
1588
    }
1589
}
1590

    
1591
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1592
                                     BlockDriverState *bs_src)
1593
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
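
/*
 * Illustrative sketch (not part of the original source): external snapshot
 * creation is the typical bdrv_append() caller.  A new, anonymous overlay
 * is opened and then spliced in above the device's current image:
 *
 *     BlockDriverState *overlay = bdrv_new("");   // "" keeps it anonymous
 *     // ... bdrv_open() 'overlay' with BDRV_O_NO_BACKING ...
 *     bdrv_append(overlay, bs);   // guest writes now go to the overlay
 *
 * After the call the device keeps using 'bs', whose contents are now the
 * overlay's, with the old image as its backing file.
 */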

static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}
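
/*
 * Illustrative sketch (not part of the original source): a device model
 * registers its callbacks once it has attached itself.  Only the hooks a
 * device cares about need to be non-NULL; the callback names match the
 * dev_ops users below, while 'my_dev' and its functions are assumed:
 *
 *     static const BlockDevOps my_dev_block_ops = {
 *         .change_media_cb  = my_dev_change_media_cb,
 *         .eject_request_cb = my_dev_eject_request_cb,
 *         .is_tray_open     = my_dev_is_tray_open,
 *         .resize_cb        = my_dev_resize_cb,
 *     };
 *
 *     bdrv_attach_dev_nofail(bs, my_dev);
 *     bdrv_set_dev_ops(bs, &my_dev_block_ops, my_dev);
 */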

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
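
/*
 * Illustrative sketch (not part of the original source): a report-only check
 * must distinguish "check ran" from "image is clean" by inspecting the
 * result struct, since a 0 return only means the check completed:
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, 0);   // 0 = report only, don't repair
 *     if (ret < 0) {
 *         // internal error, or the driver lacks bdrv_check (-ENOTSUP)
 *     } else if (res.corruptions || res.check_errors) {
 *         // image has problems; a second pass with BDRV_FIX_ERRORS could
 *         // attempt repair
 *     }
 */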

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
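
/*
 * Illustrative sketch (not part of the original source): committing a
 * single device's COW image into its backing file; "virtio0" is only an
 * assumed device name.  bdrv_commit_all() below is the loop variant used
 * by the 'commit all' monitor command:
 *
 *     BlockDriverState *bs = bdrv_find("virtio0");
 *     if (bs && bdrv_commit(bs) < 0) {
 *         // -ENOTSUP: no backing file; -EBUSY: bs or its backing file
 *         // is in use (e.g. by a block job); -EACCES: reopen r/w failed
 *     }
 */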

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
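
/*
 * Worked example (added for illustration): with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors.  Rounding the request
 * [sector_num = 130, nb_sectors = 10] gives
 *
 *     *cluster_sector_num = ALIGN_DOWN(130, 128)          = 128
 *     *cluster_nb_sectors = ALIGN_UP(130 - 128 + 10, 128) = 128
 *
 * i.e. the whole cluster range [128, 256) containing the request.
 */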

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
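
/*
 * Illustrative sketch (not part of the original source): retargeting an
 * image at a new backing file, as block-stream and commit completion do;
 * "base.qcow2" is an assumed filename.  Passing NULL for both arguments
 * drops the backing file reference entirely:
 *
 *     ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");
 *     if (ret == -ENOTSUP) {
 *         // the format driver (e.g. raw) cannot rewrite its header
 *     }
 */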

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;


    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}


static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov,
                                     rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov,
                                      rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write,
                       BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
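
/*
 * Illustrative sketch (not part of the original source): the synchronous
 * helpers operate on whole 512-byte sectors.  Writing a sector and reading
 * it back:
 *
 *     uint8_t buf[BDRV_SECTOR_SIZE];
 *     memset(buf, 0xa5, sizeof(buf));
 *     if (bdrv_write(bs, 0, buf, 1) < 0 ||
 *         bdrv_read(bs, 0, buf, 1) < 0) {
 *         // handle -EIO / -ENOMEDIUM / -EACCES as documented above
 *     }
 *
 * These are intended for setup and metadata paths; guest I/O goes through
 * the coroutine and AIO interfaces instead.
 */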

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
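
/*
 * Illustrative sketch (not part of the original source): a caller can
 * pre-zero a destination image so that later copy passes may skip
 * already-zero regions:
 *
 *     if (bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP) < 0) {
 *         // fall back to writing explicit zero buffers
 *     }
 *
 * BDRV_REQ_MAY_UNMAP lets drivers discard clusters instead of allocating
 * zero-filled ones, where the format/protocol supports it.
 */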

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
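
/*
 * Illustrative sketch (not part of the original source): format drivers use
 * bdrv_pwrite_sync() for metadata that must reach the disk in order, e.g.
 * updating an image header before writing data that depends on it; 'header'
 * here is an assumed driver-specific struct:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // header may be unwritten, but never reordered
 *     }
 *
 * The barrier semantics come from the bdrv_flush() issued after the write
 * in writeback cache modes; writethrough modes flush implicitly.
 */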

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, false);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = MAX(0, total_sectors - sector_num);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* align request */
        if (bs->bl.write_zeroes_alignment &&
            num >= bs->bl.write_zeroes_alignment &&
            sector_num % bs->bl.write_zeroes_alignment) {
            if (num > bs->bl.write_zeroes_alignment) {
                num = bs->bl.write_zeroes_alignment;
            }
            num -= sector_num % bs->bl.write_zeroes_alignment;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* allocate bounce buffer only once and ensure that it
                 * is big enough for this and all future requests.
                 */
                size_t bufsize = num <= nb_sectors ? num : max_write_zeroes;
                iov.iov_base = qemu_blockalign(bs, bufsize * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, bufsize * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, true);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
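
/*
 * Illustrative sketch (not part of the original source): a device model's
 * completion path maps the errno of a failed request to an action and then
 * reports it; with werror=enospc only ENOSPC pauses the guest:
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, is_read, -ret);
 *     bdrv_error_action(bs, action, is_read, -ret);
 *     if (action == BDRV_ACTION_STOP) {
 *         // queue the request for retry once the VM is resumed
 *     }
 */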

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}
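
/*
 * Illustrative sketch (not part of the original source): walking an image
 * and decoding the status bits, in the style of 'qemu-img map':
 *
 *     int pnum;
 *     int64_t st = bdrv_get_block_status(bs, sector, remaining, &pnum);
 *     if (st < 0) {
 *         // error
 *     } else if (st & BDRV_BLOCK_ZERO) {
 *         // the next 'pnum' sectors read as zeroes
 *     } else if (st & BDRV_BLOCK_DATA) {
 *         // data present; if BDRV_BLOCK_OFFSET_VALID is also set, the
 *         // host offset is (st & BDRV_BLOCK_OFFSET_MASK)
 *     }
 */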
3345

    
3346
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3347
                                   int nb_sectors, int *pnum)
3348
{
3349
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3350
    if (ret < 0) {
3351
        return ret;
3352
    }
3353
    return
3354
        (ret & BDRV_BLOCK_DATA) ||
3355
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3356
}
3357

    
3358
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

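/*
 * Illustrative sketch (hypothetical helper): deciding whether a sector
 * range must be copied during a commit-style operation by asking whether
 * any layer between 'top' and 'base' holds data for it.  Error returns
 * are collapsed to false here only for brevity.
 */
#if 0
static bool example_needs_copy(BlockDriverState *top, BlockDriverState *base,
                               int64_t sector_num, int nb_sectors)
{
    int pnum;

    /* Passing base == NULL would scan the whole chain below 'top'. */
    return bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
                                   &pnum) == 1;
}
#endif
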
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

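/*
 * Example: with a chain /images/base.qcow2 <- /images/top.qcow2, calling
 * bdrv_find_backing_image(top_bs, "base.qcow2") combines the relative name
 * with each image's own path and compares the realpath()-canonicalised
 * results, so the base is found even when QEMU's CWD is elsewhere; a
 * protocol-style name (one for which path_has_protocol() is true) is
 * instead compared verbatim against each recorded backing_file.
 */
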
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

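/*
 * Example: three queued writes at sectors 0+8, 8+8 and 32+8 are sorted by
 * start sector and scanned pairwise.  Since 8 <= 0+8, the first two
 * requests are exactly sequential and are merged into a single 0+16
 * request with a concatenated qiov; 32 > 16, so the third request stays
 * separate and multiwrite_merge() returns 2.
 */
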
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the
 * requests may or may not have been submitted yet. In particular, this means
 * that the callback will be called for some of the requests, and for others
 * it won't. The caller must check the error field of the BlockRequest to
 * wait for the right callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

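/*
 * Illustrative sketch (hypothetical names): batching two writes through
 * bdrv_aio_multiwrite().  The qiovs must stay valid until the callbacks
 * run.
 */
#if 0
static void example_submit_batch(BlockDriverState *bs,
                                 QEMUIOVector *qiov1, QEMUIOVector *qiov2,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0,  .nb_sectors = qiov1->size >> 9, .qiov = qiov1,
          .cb = cb, .opaque = opaque },
        { .sector = 64, .nb_sectors = qiov2->size >> 9, .qiov = qiov2,
          .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* Submission failed: reqs[i].error != 0 marks the requests whose
         * callback will never run. */
    }
}
#endif
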
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

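/*
 * Illustrative sketch (hypothetical driver code): defining a custom AIOCB
 * type on top of qemu_aio_get()/qemu_aio_release(), mirroring
 * bdrv_em_aiocb_info above.
 */
#if 0
typedef struct ExampleAIOCB {
    BlockDriverAIOCB common;    /* must come first for container_of() */
    int ret;
} ExampleAIOCB;

static void example_aio_cancel(BlockDriverAIOCB *blockacb)
{
    ExampleAIOCB *acb = container_of(blockacb, ExampleAIOCB, common);

    qemu_aio_release(acb);
}

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleAIOCB),
    .cancel     = example_aio_cancel,
};
#endif
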
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

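/*
 * Example: for a qcow2 image opened with cache=unsafe, bdrv_co_flush()
 * still calls bdrv_co_flush_to_os() so cached metadata reaches the host
 * page cache, but BDRV_O_NO_FLUSH then skips the flush-to-disk step and
 * jumps straight to flushing bs->file, which short-circuits the same way,
 * so no disk barrier is ever issued.
 */
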
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

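/*
 * Example: with bl.discard_alignment == 8, a discard of sectors 5..100
 * first clips num to 8 - (5 % 8) == 3 so that the next iteration starts
 * on an aligned boundary at sector 8; later iterations then proceed in
 * aligned chunks of at most max_discard sectors each.
 */
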
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

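/*
 * Example: a granularity of 65536 bytes becomes 65536 >> BDRV_SECTOR_BITS
 * == 128 sectors, and ffs(128) - 1 == 7, so each bit of the resulting
 * HBitmap tracks a 2^7-sector (64 KiB) region of the image.
 */
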
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If, after releasing, the reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

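/*
 * Illustrative sketch (hypothetical device-model code, assuming the
 * BDRV_ACCT_READ member of enum BlockAcctType): bracketing a request with
 * the accounting hooks so it shows up in the per-device statistics.
 */
#if 0
static void example_account_read(BlockDriverState *bs, QEMUIOVector *qiov)
{
    BlockAcctCookie cookie;

    bdrv_acct_start(bs, &cookie, qiov->size, BDRV_ACCT_READ);
    /* ... submit the read and wait for completion ... */
    bdrv_acct_done(bs, &cookie);
}
#endif
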
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_unref(bs);
    }
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs)
{
    if (bs->drv->bdrv_check_ext_snapshot) {
        return bs->drv->bdrv_check_ext_snapshot(bs);
    }

    if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) {
        return bs->file->drv->bdrv_check_ext_snapshot(bs);
    }

    /* external snapshots are allowed by default */
    return EXT_SNAPSHOT_ALLOWED;
}

ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs)
{
    return EXT_SNAPSHOT_FORBIDDEN;
}