/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
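
/*
 * Illustrative sketch (not part of the original file): the typical caller
 * sequence for the throttling API above.  The ThrottleConfig field names
 * follow the throttle.h of this era and should be treated as assumptions:
 *
 *     ThrottleConfig cfg = {
 *         .buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024, // 10 MB/s
 *         .buckets[THROTTLE_OPS_TOTAL].avg = 100,              // 100 IOPS
 *     };
 *     bdrv_io_limits_enable(bs);    // first: initializes state and timers
 *     bdrv_set_io_limits(bs, &cfg); // then apply the actual limits
 *
 * Reads and writes are then routed through bdrv_io_limits_intercept(),
 * which queues a request whenever the leaky-bucket timer says it must wait
 * or earlier requests of the same direction are still queued.
 */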

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
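
/*
 * Examples (illustrative, not in the original source) of what path_combine()
 * computes, traced through the code above:
 *
 *     path_combine(buf, sizeof(buf), "/images/base.qcow2", "backing.raw");
 *         -> "/images/backing.raw"   (relative name, base's directory kept)
 *     path_combine(buf, sizeof(buf), "/images/base.qcow2", "/abs/b.raw");
 *         -> "/abs/b.raw"            (absolute names are copied as-is)
 *     path_combine(buf, sizeof(buf), "nbd:host:1234", "backing.raw");
 *         -> "nbd:backing.raw"       (everything up to and including the
 *                                     first ':' of the base is preserved)
 */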

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}
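
/*
 * Sketch (illustrative, not part of the original source): how a format
 * driver plugs into bdrv_register().  A driver that only provides
 * coroutine callbacks gets the AIO interface for free via the emulation
 * layers installed above.  All "mydrv" names are hypothetical:
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name    = "mydrv",
 *         .instance_size  = sizeof(MyDrvState),  // hypothetical state type
 *         .bdrv_open      = mydrv_open,
 *         .bdrv_close     = mydrv_close,
 *         .bdrv_co_readv  = mydrv_co_readv,
 *         .bdrv_co_writev = mydrv_co_writev,
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *     block_init(bdrv_mydrv_init);
 */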

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
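
/*
 * Example (illustrative; the configure option names are assumptions):
 * with a build configured along the lines of
 * --block-drv-rw-whitelist=qcow2,raw --block-drv-ro-whitelist=vmdk,
 * bdrv_find_whitelisted_format("vmdk", false) returns NULL while
 * bdrv_find_whitelisted_format("vmdk", true) returns the vmdk driver,
 * i.e. such an image may be attached read-only but never for writing.
 */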

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}
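
/*
 * Usage sketch (not in the original file): creating a 1 GiB raw file
 * through the protocol layer.  Option names come from the driver's
 * create_options table; the path is hypothetical:
 *
 *     QEMUOptionParameter *opts;
 *     Error *err = NULL;
 *     BlockDriver *drv = bdrv_find_protocol("/tmp/test.img", true);
 *     opts = parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     if (bdrv_create_file("/tmp/test.img", opts, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 *     free_option_parameters(opts);
 */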

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
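
/*
 * Usage sketch (illustrative): the caller supplies the buffer and checks
 * the negative-errno convention, mirroring how bdrv_open() below uses this
 * helper for snapshot=on overlays:
 *
 *     char tmp[PATH_MAX + 1];
 *     int ret = get_tmp_filename(tmp, sizeof(tmp));
 *     if (ret < 0) {
 *         error_setg_errno(errp, -ret, "Could not get temporary filename");
 *     }
 */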

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
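
/*
 * Examples (illustrative) of the resolution rules above:
 *
 *     bdrv_find_protocol("/dev/cdrom", true)
 *         -> host CD-ROM driver (device probe wins, per the XXX above)
 *     bdrv_find_protocol("nbd:localhost:10809", true)
 *         -> driver whose protocol_name is "nbd"
 *     bdrv_find_protocol("nbd:localhost:10809", false)
 *         -> "file" driver (protocol prefixes not allowed)
 *     bdrv_find_protocol("disk.img", true)
 *         -> "file" driver (no "<protocol>:" prefix present)
 */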

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
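
/*
 * The cache-mode mapping above, spelled out (derived directly from the
 * code, shown here for reference):
 *
 *     mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *     writethrough        -                -                -
 *     writeback           -                x                -
 *     none                x                x                -
 *     directsync          x                -                -
 *     unsafe              -                x                x
 *
 * e.g. "none" gives O_DIRECT-style host access with a writeback cache
 * model toward the guest, while "directsync" bypasses the host cache and
 * also presents a writethrough cache to the guest.
 */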

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* bdrv_open() was called with a protocol directly as drv. This layer is
     * already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags, Error **errp)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
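
/*
 * Usage sketch (not part of the original source): opening a protocol layer
 * with an options dictionary.  Note the QDict reference is consumed even on
 * failure; the path is hypothetical:
 *
 *     BlockDriverState *file = NULL;
 *     QDict *opts = qdict_new();
 *     Error *err = NULL;
 *     qdict_put(opts, "driver", qstring_from_str("file"));
 *     qdict_put(opts, "filename", qstring_from_str("/tmp/test.img"));
 *     if (bdrv_file_open(&file, NULL, opts, BDRV_O_RDWR, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));  // opts already freed
 *         error_free(err);
 *     }
 */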

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }
    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
            bs->backing_hd->file->filename);
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        char backing_filename[PATH_MAX];

        if (qdict_size(options) != 0) {
            error_setg(errp, "Can't use snapshot=on with driver-specific options");
            ret = -EINVAL;
            goto fail;
        }
        assert(filename != NULL);

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, NULL, 0, drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        /* Real path is meaningless for protocols */
        if (path_has_protocol(filename)) {
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        } else if (!realpath(filename, backing_filename)) {
            ret = -errno;
            error_setg_errno(errp, errno, "Could not resolve path '%s'", filename);
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
                             backing_filename);
        if (drv) {
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    qdict_extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        ret = find_image_format(file, filename, &drv, &local_err);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
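
/*
 * Sketch (illustrative): reopening two devices read-only in one atomic
 * transaction -- the multi-device pattern that bdrv_reopen() wraps for the
 * single-device case.  bdrv_reopen_multiple() frees the queue itself:
 *
 *     BlockReopenQueue *queue = NULL;
 *     Error *err = NULL;
 *     queue = bdrv_reopen_queue(queue, bs0, bs0->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs1, bs1->open_flags & ~BDRV_O_RDWR);
 *     if (bdrv_reopen_multiple(queue, &err) < 0) {
 *         // neither device changed: all prepared entries were aborted
 *     }
 */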


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (bdrv_start_throttled_reqs(bs)) {
                busy = true;
            }
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states
   list.  Also, NUL-terminate the device_name to prevent double removal */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
1713

    
1714
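/* Illustrative sketch (not part of the build): a caller taking a live
 * external snapshot opens the new overlay anonymously and then grafts it
 * on top of the chain.  The names snapshot_bs, snapshot_file and options
 * here are hypothetical:
 *
 *     BlockDriverState *snapshot_bs = bdrv_new("");   // anonymous BDS
 *     ret = bdrv_open(snapshot_bs, snapshot_file, options,
 *                     flags | BDRV_O_NO_BACKING, drv);
 *     if (ret == 0) {
 *         bdrv_append(snapshot_bs, bs);  // bs keeps its device/job state
 *     }
 */
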
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

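/* Sketch of how a device model wires itself up to these callbacks.  The
 * BlockDevOps fields shown are the ones consumed by the helpers below;
 * the my_dev_* names are hypothetical:
 *
 *     static const BlockDevOps my_dev_block_ops = {
 *         .change_media_cb  = my_dev_change_media_cb,
 *         .eject_request_cb = my_dev_eject_request_cb,
 *         .is_tray_open     = my_dev_is_tray_open,
 *         .resize_cb        = my_dev_resize_cb,
 *     };
 *
 *     bdrv_set_dev_ops(bs, &my_dev_block_ops, my_dev_state);
 */
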
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

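/* Typical call pattern (a sketch; the BDRV_FIX_* mode flags are assumed
 * from the BdrvCheckMode definition, and error handling is elided):
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS);
 *     if (ret == 0 && res.corruptions > 0) {
 *         // check completed, but corruption remains in the image
 *     }
 */
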
#define COMMIT_BUF_SECTORS 2048

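/* 2048 sectors * 512 bytes/sector = 1 MiB copied per loop iteration below */
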
/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags =  bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

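/* Worked example: with a 64 KiB cluster size, c = 65536 / 512 = 128
 * sectors.  A request for sector_num = 300, nb_sectors = 5 rounds to
 * cluster_sector_num = 256 and cluster_nb_sectors = 128, i.e. the whole
 * cluster containing sectors 300..304. */
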
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

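/* Example: a tracked request covering sectors [8, 16) does not overlap a
 * query for [16, 20) -- the first test fires -- but does overlap [15, 17),
 * since neither range ends before the other begins. */
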
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov,
                                     rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov,
                                      rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write,
                       BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

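/* Worked example of the alignment math above: for offset = 700 and
 * count1 = 1000, the head read covers len = (512 - 700) & 511 = 324 bytes
 * of sector 1 (starting at byte offset 700 & 511 = 188 within it), the
 * middle step reads 676 >> 9 = 1 full sector in place, and the final
 * 164 bytes come from a bounce read of the last sector. */
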
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return qiov->size;
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, false);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = MAX(0, total_sectors - sector_num);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* align request */
        if (bs->bl.write_zeroes_alignment &&
            num >= bs->bl.write_zeroes_alignment &&
            sector_num % bs->bl.write_zeroes_alignment) {
            if (num > bs->bl.write_zeroes_alignment) {
                num = bs->bl.write_zeroes_alignment;
            }
            num -= sector_num % bs->bl.write_zeroes_alignment;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* allocate bounce buffer only once and ensure that it
                 * is big enough for this and all future requests.
                 */
                size_t bufsize = num <= nb_sectors ? num : max_write_zeroes;
                iov.iov_base = qemu_blockalign(bs, bufsize * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, bufsize * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}

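/* Alignment example: with bs->bl.write_zeroes_alignment = 128, a request
 * for sector_num = 300, nb_sectors = 1000 first issues a fragment of
 * num = 128 - (300 % 128) = 84 sectors, which advances sector_num to the
 * aligned boundary 384; subsequent iterations then run fully aligned. */
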
/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, nb_sectors, true);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}

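/* Example of the mapping above: with werror=enospc configured on a drive,
 * a failed write with error == ENOSPC yields BDRV_ACTION_STOP (pause the
 * guest until space is freed), while any other errno, e.g. EIO, yields
 * BDRV_ACTION_REPORT (forward the error to the guest). */
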
/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}

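/* Callers typically scan an image extent by extent, advancing by 'pnum',
 * the way bdrv_commit() does above.  A sketch (total_sectors and the
 * per-call 'chunk' bound are assumed to be set up by the caller):
 *
 *     int64_t sector = 0;
 *     int n;
 *     while (sector < total_sectors) {
 *         int ret = bdrv_is_allocated(bs, sector, chunk, &n);
 *         if (ret < 0) {
 *             break;                                    // error
 *         }
 *         // the n sectors from 'sector' are allocated iff ret != 0
 *         sector += n;
 *     }
 */
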
/*
3362
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3363
 *
3364
 * Return true if the given sector is allocated in any image between
3365
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3366
 * sector is allocated in any image of the chain.  Return false otherwise.
3367
 *
3368
 * 'pnum' is set to the number of sectors (including and immediately following
3369
 *  the specified sector) that are known to be in the same
3370
 *  allocated/unallocated state.
3371
 *
3372
 */
3373
int bdrv_is_allocated_above(BlockDriverState *top,
3374
                            BlockDriverState *base,
3375
                            int64_t sector_num,
3376
                            int nb_sectors, int *pnum)
3377
{
3378
    BlockDriverState *intermediate;
3379
    int ret, n = nb_sectors;
3380

    
3381
    intermediate = top;
3382
    while (intermediate && intermediate != base) {
3383
        int pnum_inter;
3384
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3385
                                &pnum_inter);
3386
        if (ret < 0) {
3387
            return ret;
3388
        } else if (ret) {
3389
            *pnum = pnum_inter;
3390
            return 1;
3391
        }
3392

    
3393
        /*
3394
         * [sector_num, nb_sectors] is unallocated on top but intermediate
3395
         * might have
3396
         *
3397
         * [sector_num+x, nr_sectors] allocated.
3398
         */
3399
        if (n > pnum_inter &&
3400
            (intermediate == top ||
3401
             sector_num + pnum_inter < intermediate->total_sectors)) {
3402
            n = pnum_inter;
3403
        }
3404

    
3405
        intermediate = intermediate->backing_hd;
3406
    }
3407

    
3408
    *pnum = n;
3409
    return 0;
3410
}
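
/* Usage sketch (an assumption about callers such as the commit/stream block
 * jobs; nothing here invokes it):
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num,
 *                                       nb_sectors, &pnum);
 *     if (ret == 1) {
 *         // the first pnum sectors are provided by [base..top) and must
 *         // be copied before that part of the chain can be dropped
 *     } else if (ret == 0) {
 *         // pnum sectors can be skipped: nothing above base provides them
 *     }
 */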

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}
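
/* A minimal sketch of the intended save/load pairing (assumed caller, e.g.
 * migration code; the 'pos' bookkeeping belongs to the caller):
 *
 *     uint8_t buf[4096];
 *     // ... fill buf with device state ...
 *     if (bdrv_save_vmstate(bs, buf, pos, sizeof(buf)) < 0) {
 *         // neither the format driver nor bs->file supports vmstate
 *     }
 *     // ... later, when resuming ...
 *     bdrv_load_vmstate(bs, buf, pos, sizeof(buf));
 */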

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
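
/* Example of the path handling above (hypothetical names, for illustration):
 * with curr_bs->filename == "/vm/snap/top.qcow2" and backing_file ==
 * "../base.qcow2", path_combine() yields "/vm/snap/../base.qcow2" and
 * realpath() canonicalizes it to "/vm/base.qcow2" before the strcmp(), so
 * two textually different spellings of the same file still match. */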

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}
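
/* Illustrative caller pattern (an assumption; device emulation normally
 * drives these entry points):
 *
 *     static void my_read_done(void *opaque, int ret)  // hypothetical cb
 *     {
 *         // ret is 0 on success, -errno on failure
 *     }
 *
 *     bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
 *                    my_read_done, my_request);
 *     // completion arrives via the callback from the main loop, e.g.
 *     // while the caller spins in qemu_aio_wait()
 */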

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
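
/* Worked example of the merge rule above (illustrative numbers): a request
 * for sectors [0, 8) followed by one for [8, 16) gives oldreq_last == 8 and
 * reqs[i].sector == 8 <= 8, so both merge into a single request covering
 * [0, 16) whose qiov concatenates the two vectors.  A request starting at
 * sector 9 would leave a one-sector gap and therefore stays separate. */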

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case, this function returns -1 and any of the
 * requests may or may not be submitted yet. In particular, this means that
 * the callback will be called for some of the requests, for others it won't.
 * The caller must check the error field of the BlockRequest to wait for the
 * right callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
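
/* Sketch of a typical submission (assumed caller, e.g. virtio-blk style
 * batching; done0/done1 and r0/r1 are hypothetical):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &q0,
 *           .cb = done0, .opaque = r0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &q1,
 *           .cb = done1, .opaque = r1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // inspect reqs[i].error: callbacks fire only where error == 0
 *     }
 */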

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
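
/* The two emulation layers above are deliberately symmetric: a driver that
 * only implements synchronous bdrv_read/bdrv_write gets AIO emulated through
 * a bounce buffer plus a bottom half (bdrv_aio_rw_vector), while a driver
 * that only implements AIO gets coroutine semantics emulated by yielding in
 * bdrv_co_io_em() until bdrv_co_io_em_complete() re-enters the coroutine
 * with the result. */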

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
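
/* Flush ordering at a glance, for an assumed two-layer stack (format driver
 * over a protocol driver, e.g. qcow2 over a POSIX file): one call performs
 *
 *     format  ->  bdrv_co_flush_to_os()     // push out internal caches
 *     format  ->  bdrv_co_flush_to_disk()   // skipped with cache=unsafe
 *     proto   ->  bdrv_co_flush(bs->file)   // recurse down the stack
 *
 * so every layer is flushed exactly once, top to bottom. */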

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        int max_discard = bs->bl.max_discard ?
                          bs->bl.max_discard : MAX_DISCARD_DEFAULT;

        while (nb_sectors > 0) {
            int ret;
            int num = nb_sectors;

            /* align request */
            if (bs->bl.discard_alignment &&
                num >= bs->bl.discard_alignment &&
                sector_num % bs->bl.discard_alignment) {
                if (num > bs->bl.discard_alignment) {
                    num = bs->bl.discard_alignment;
                }
                num -= sector_num % bs->bl.discard_alignment;
            }

            /* limit request size */
            if (num > max_discard) {
                num = max_discard;
            }

            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
            if (ret) {
                return ret;
            }

            sector_num += num;
            nb_sectors -= num;
        }
        return 0;
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}
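
/* Concrete walk-through of the alignment clamp above (illustrative values):
 * with bs->bl.discard_alignment == 8, a request at sector_num == 5 for
 * num == 20 first caps num to 8 and then subtracts 5 % 8 == 5, leaving 3.
 * The first iteration thus discards sectors [5, 8) only, and the next
 * iteration starts aligned at sector 8. */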

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
                                          int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
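
/* Example of the granularity math above (illustrative): granularity == 65536
 * bytes becomes 128 sectors after the shift, and ffs(128) - 1 == 7, so the
 * HBitmap tracks dirtiness in 2^7-sector (64 KiB) chunks.  Callers pass the
 * granularity in bytes; it must be a power of two, as asserted above. */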

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                   int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If, after releasing, the reference count drops to zero, the
 * BlockDriverState is deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
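
/* Accounting is a start/done bracket around each guest-visible request; a
 * sketch of the assumed pattern in device models:
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     // ... submit the request and wait for its completion ...
 *     bdrv_acct_done(bs, &cookie);  // folds bytes, ops and latency into bs
 */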

void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_unref(bs);
    }
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
}
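
/* Minimal usage sketch (mirrors what qemu-img create does; the values are
 * illustrative):
 *
 *     Error *err = NULL;
 *     bdrv_img_create("/tmp/test.qcow2", "qcow2",
 *                     NULL, NULL,           // no backing file / format
 *                     NULL,                 // no -o option string
 *                     1024 * 1024 * 1024,   // 1 GiB virtual size
 *                     0, &err, true);       // flags, errp, quiet
 *     if (error_is_set(&err)) {
 *         // report error_get_pretty(err), then error_free(err)
 *     }
 */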

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs)
{
    if (bs->drv->bdrv_check_ext_snapshot) {
        return bs->drv->bdrv_check_ext_snapshot(bs);
    }

    if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) {
        return bs->file->drv->bdrv_check_ext_snapshot(bs);
    }

    /* external snapshots are allowed by default */
    return EXT_SNAPSHOT_ALLOWED;
}

ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs)
{
    return EXT_SNAPSHOT_FORBIDDEN;
}