Statistics
| Branch: | Revision:

root / block.c @ 212a5a8f

History | View | Annotate | Download (143.1 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "block/qapi.h"
36
#include "qmp-commands.h"
37
#include "qemu/timer.h"
38

    
39
#ifdef CONFIG_BSD
40
#include <sys/types.h>
41
#include <sys/stat.h>
42
#include <sys/ioctl.h>
43
#include <sys/queue.h>
44
#ifndef __DragonFly__
45
#include <sys/disk.h>
46
#endif
47
#endif
48

    
49
#ifdef _WIN32
50
#include <windows.h>
51
#endif
52

    
53
struct BdrvDirtyBitmap {
54
    HBitmap *bitmap;
55
    QLIST_ENTRY(BdrvDirtyBitmap) list;
56
};
57

    
58
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59

    
60
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63
        BlockDriverCompletionFunc *cb, void *opaque);
64
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66
        BlockDriverCompletionFunc *cb, void *opaque);
67
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68
                                         int64_t sector_num, int nb_sectors,
69
                                         QEMUIOVector *iov);
70
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71
                                         int64_t sector_num, int nb_sectors,
72
                                         QEMUIOVector *iov);
73
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
74
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
75
    BdrvRequestFlags flags);
76
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
77
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
78
    BdrvRequestFlags flags);
79
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80
                                               int64_t sector_num,
81
                                               QEMUIOVector *qiov,
82
                                               int nb_sectors,
83
                                               BdrvRequestFlags flags,
84
                                               BlockDriverCompletionFunc *cb,
85
                                               void *opaque,
86
                                               bool is_write);
87
static void coroutine_fn bdrv_co_do_rw(void *opaque);
88
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90

    
91
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
93

    
94
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96

    
97
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
99

    
100
/* If non-zero, use only whitelisted block drivers */
101
static int use_bdrv_whitelist;
102

    
103
#ifdef _WIN32
104
/*
 * Return 1 when filename starts with an ASCII letter immediately followed
 * by ':' (a drive prefix such as "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = (letter >= 'a' && letter <= 'z');
    const int is_upper = (letter >= 'A' && letter <= 'Z');

    if (!is_lower && !is_upper) {
        /* do not look at filename[1]: the string may be empty */
        return 0;
    }
    return filename[1] == ':';
}
110

    
111
/*
 * Return 1 when filename is exactly a drive prefix ("d:" and nothing more)
 * or when strstart() matches it against "\\.\" or "//./"; otherwise 0.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
121
#endif
122

    
123
/* throttling disk I/O limits */
124
void bdrv_set_io_limits(BlockDriverState *bs,
125
                        ThrottleConfig *cfg)
126
{
127
    int i;
128

    
129
    throttle_config(&bs->throttle_state, cfg);
130

    
131
    for (i = 0; i < 2; i++) {
132
        qemu_co_enter_next(&bs->throttled_reqs[i]);
133
    }
134
}
135

    
136
/* this function drain all the throttled IOs */
137
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138
{
139
    bool drained = false;
140
    bool enabled = bs->io_limits_enabled;
141
    int i;
142

    
143
    bs->io_limits_enabled = false;
144

    
145
    for (i = 0; i < 2; i++) {
146
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147
            drained = true;
148
        }
149
    }
150

    
151
    bs->io_limits_enabled = enabled;
152

    
153
    return drained;
154
}
155

    
156
/*
 * Switch I/O throttling off for bs: clear the enabled flag, drain the
 * throttled request queues, then destroy the throttle state.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
    bdrv_start_throttled_reqs(bs);
    throttle_destroy(&bs->throttle_state);
}
164

    
165
static void bdrv_throttle_read_timer_cb(void *opaque)
166
{
167
    BlockDriverState *bs = opaque;
168
    qemu_co_enter_next(&bs->throttled_reqs[0]);
169
}
170

    
171
static void bdrv_throttle_write_timer_cb(void *opaque)
172
{
173
    BlockDriverState *bs = opaque;
174
    qemu_co_enter_next(&bs->throttled_reqs[1]);
175
}
176

    
177
/* should be called before bdrv_set_io_limits if a limit is set */
178
void bdrv_io_limits_enable(BlockDriverState *bs)
179
{
180
    assert(!bs->io_limits_enabled);
181
    throttle_init(&bs->throttle_state,
182
                  QEMU_CLOCK_VIRTUAL,
183
                  bdrv_throttle_read_timer_cb,
184
                  bdrv_throttle_write_timer_cb,
185
                  bs);
186
    bs->io_limits_enabled = true;
187
}
188

    
189
/* This function makes an IO wait if needed
190
 *
191
 * @nb_sectors: the number of sectors of the IO
192
 * @is_write:   is the IO a write
193
 */
194
static void bdrv_io_limits_intercept(BlockDriverState *bs,
195
                                     int nb_sectors,
196
                                     bool is_write)
197
{
198
    /* does this io must wait */
199
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
200

    
201
    /* if must wait or any request of this type throttled queue the IO */
202
    if (must_wait ||
203
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
205
    }
206

    
207
    /* the IO will be executed, do the accounting */
208
    throttle_account(&bs->throttle_state,
209
                     is_write,
210
                     nb_sectors * BDRV_SECTOR_SIZE);
211

    
212
    /* if the next request must wait -> do nothing */
213
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214
        return;
215
    }
216

    
217
    /* else queue next request for execution */
218
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
219
}
220

    
221
/*
 * Check whether path starts with "<protocol>:", i.e. whether a ':' appears
 * before any directory separator.  On Windows, drive specifications are
 * never treated as a protocol and '\' also counts as a separator.
 *
 * Returns 1 if a protocol prefix is present, 0 otherwise.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* the first stop character (or the terminating NUL) decides */
    return path[strcspn(path, stop_chars)] == ':';
}
238

    
239
/*
 * Return 1 if path is absolute, 0 otherwise.
 *
 * A path is absolute when it begins with '/'.  On Windows it is also
 * absolute when it begins with '\' or is recognised by is_windows_drive()
 * or is_windows_drive_prefix() (names like "\\.\d:" or "d:...").
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
251

    
252
/*
 * Build the path of filename relative to base_path.
 *
 * If filename is absolute it is simply copied to dest.  Otherwise dest
 * receives the leading part of base_path -- everything up to and including
 * the later of its first ':' and its last directory separator -- followed
 * by filename.  URLs are supported.
 *
 * @dest:      output buffer
 * @dest_size: size of dest in bytes; nothing is written if it is <= 0.
 *             The prefix copied from base_path is capped at dest_size - 1
 *             bytes.
 * @base_path: path the relative filename is resolved against
 * @filename:  absolute or relative file name
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* p: just past the first ':' of base_path, or its start */
    p = strchr(base_path, ':');
    if (p) {
        p++;
    } else {
        p = base_path;
    }

    /* p1: just past the last directory separator, or the start */
    p1 = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *p2 = strrchr(base_path, '\\');

        /*
         * Only order the two pointers when both are valid: a relational
         * comparison involving a null pointer is undefined behaviour.
         */
        if (p2 && (!p1 || p2 > p1)) {
            p1 = p2;
        }
    }
#endif
    if (p1) {
        p1++;
    } else {
        p1 = base_path;
    }

    /* keep whichever prefix is longer */
    if (p1 > p) {
        p = p1;
    }

    len = p - base_path;
    if (len > dest_size - 1) {
        len = dest_size - 1;
    }
    memcpy(dest, base_path, len);
    dest[len] = '\0';
    pstrcat(dest, dest_size, filename);
}
295

    
296
/*
 * Store the backing file name of bs in dest (sz bytes).
 *
 * An empty backing file name, or one with a protocol prefix, is copied
 * unchanged; anything else is combined with bs->filename by
 * path_combine().
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;

    if (backing[0] != '\0' && !path_has_protocol(backing)) {
        path_combine(dest, sz, bs->filename, backing);
        return;
    }
    pstrcpy(dest, sz, backing);
}
304

    
305
/*
 * Register a block driver: fill in emulation callbacks the driver lacks
 * and insert it at the head of the global driver list.
 */
void bdrv_register(BlockDriver *bdrv)
{
    /* Drivers that bring no coroutine read function get the emulation. */
    if (bdrv->bdrv_co_readv == NULL) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /*
         * bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so
         * a driver without aio needs that layer emulated as well.
         */
        if (bdrv->bdrv_aio_readv == NULL) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
324

    
325
/* create a new block device (by default it is empty) */
326
BlockDriverState *bdrv_new(const char *device_name)
327
{
328
    BlockDriverState *bs;
329

    
330
    bs = g_malloc0(sizeof(BlockDriverState));
331
    QLIST_INIT(&bs->dirty_bitmaps);
332
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
333
    if (device_name[0] != '\0') {
334
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
335
    }
336
    bdrv_iostatus_disable(bs);
337
    notifier_list_init(&bs->close_notifiers);
338
    notifier_with_return_list_init(&bs->before_write_notifiers);
339
    qemu_co_queue_init(&bs->throttled_reqs[0]);
340
    qemu_co_queue_init(&bs->throttled_reqs[1]);
341
    bs->refcnt = 1;
342

    
343
    return bs;
344
}
345

    
346
/* Add notify to the close_notifiers list of bs. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
350

    
351
BlockDriver *bdrv_find_format(const char *format_name)
352
{
353
    BlockDriver *drv1;
354
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
355
        if (!strcmp(drv1->format_name, format_name)) {
356
            return drv1;
357
        }
358
    }
359
    return NULL;
360
}
361

    
362
/*
 * Check drv against the build-time driver whitelists.
 *
 * Returns 1 if both whitelists are empty, if the driver is on the
 * read-write list, or if read_only is set and the driver is on the
 * read-only list.  Returns 0 otherwise.
 */
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char *name = drv->format_name;
    int i;

    if (whitelist_rw[0] == NULL && whitelist_ro[0] == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    for (i = 0; whitelist_rw[i] != NULL; i++) {
        if (strcmp(name, whitelist_rw[i]) == 0) {
            return 1;
        }
    }

    /* the read-only list is consulted for read-only use only */
    if (!read_only) {
        return 0;
    }
    for (i = 0; whitelist_ro[i] != NULL; i++) {
        if (strcmp(name, whitelist_ro[i]) == 0) {
            return 1;
        }
    }
    return 0;
}
390

    
391
/*
 * Like bdrv_find_format(), but return NULL as well when the driver found
 * does not pass bdrv_is_whitelisted() for the requested access mode.
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (drv == NULL || !bdrv_is_whitelisted(drv, read_only)) {
        return NULL;
    }
    return drv;
}
397

    
398
typedef struct CreateCo {
399
    BlockDriver *drv;
400
    char *filename;
401
    QEMUOptionParameter *options;
402
    int ret;
403
    Error *err;
404
} CreateCo;
405

    
406
/*
 * Coroutine entry point for bdrv_create(); opaque is a CreateCo.
 *
 * Calls the driver's bdrv_create callback, hands any error over to
 * cco->err and finally publishes the return code in cco->ret.
 */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    Error *local_err = NULL;
    int ret;

    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }

    /* written last: bdrv_create() polls this field against NOT_DONE */
    cco->ret = ret;
}
420

    
421
int bdrv_create(BlockDriver *drv, const char* filename,
422
    QEMUOptionParameter *options, Error **errp)
423
{
424
    int ret;
425

    
426
    Coroutine *co;
427
    CreateCo cco = {
428
        .drv = drv,
429
        .filename = g_strdup(filename),
430
        .options = options,
431
        .ret = NOT_DONE,
432
        .err = NULL,
433
    };
434

    
435
    if (!drv->bdrv_create) {
436
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
437
        ret = -ENOTSUP;
438
        goto out;
439
    }
440

    
441
    if (qemu_in_coroutine()) {
442
        /* Fast-path if already in coroutine context */
443
        bdrv_create_co_entry(&cco);
444
    } else {
445
        co = qemu_coroutine_create(bdrv_create_co_entry);
446
        qemu_coroutine_enter(co, &cco);
447
        while (cco.ret == NOT_DONE) {
448
            qemu_aio_wait();
449
        }
450
    }
451

    
452
    ret = cco.ret;
453
    if (ret < 0) {
454
        if (error_is_set(&cco.err)) {
455
            error_propagate(errp, cco.err);
456
        } else {
457
            error_setg_errno(errp, -ret, "Could not create image");
458
        }
459
    }
460

    
461
out:
462
    g_free(cco.filename);
463
    return ret;
464
}
465

    
466
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
467
                     Error **errp)
468
{
469
    BlockDriver *drv;
470
    Error *local_err = NULL;
471
    int ret;
472

    
473
    drv = bdrv_find_protocol(filename, true);
474
    if (drv == NULL) {
475
        error_setg(errp, "Could not find protocol for file '%s'", filename);
476
        return -ENOENT;
477
    }
478

    
479
    ret = bdrv_create(drv, filename, options, &local_err);
480
    if (error_is_set(&local_err)) {
481
        error_propagate(errp, local_err);
482
    }
483
    return ret;
484
}
485

    
486
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: output buffer receiving the name of the created file
 * @size:     size of that buffer in bytes (on Windows it must be at least
 *            MAX_PATH, which is asserted)
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /tmp when TMPDIR is
 * not set.
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW if
 * the name does not fit into the buffer).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir = getenv("TMPDIR");
    int fd;
    int n;

    if (!tmpdir) {
        tmpdir = "/tmp";
    }

    n = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (n < 0 || n >= size) {
        /* formatting failed or the template was truncated */
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so keep close()'s error code */
        int saved_errno = errno;

        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
520

    
521
/*
522
 * Detect host devices. By convention, /dev/cdrom[N] is always
523
 * recognized as a host CDROM.
524
 */
525
static BlockDriver *find_hdev_driver(const char *filename)
526
{
527
    int score_max = 0, score;
528
    BlockDriver *drv = NULL, *d;
529

    
530
    QLIST_FOREACH(d, &bdrv_drivers, list) {
531
        if (d->bdrv_probe_device) {
532
            score = d->bdrv_probe_device(filename);
533
            if (score > score_max) {
534
                score_max = score;
535
                drv = d;
536
            }
537
        }
538
    }
539

    
540
    return drv;
541
}
542

    
543
BlockDriver *bdrv_find_protocol(const char *filename,
544
                                bool allow_protocol_prefix)
545
{
546
    BlockDriver *drv1;
547
    char protocol[128];
548
    int len;
549
    const char *p;
550

    
551
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
552

    
553
    /*
554
     * XXX(hch): we really should not let host device detection
555
     * override an explicit protocol specification, but moving this
556
     * later breaks access to device names with colons in them.
557
     * Thanks to the brain-dead persistent naming schemes on udev-
558
     * based Linux systems those actually are quite common.
559
     */
560
    drv1 = find_hdev_driver(filename);
561
    if (drv1) {
562
        return drv1;
563
    }
564

    
565
    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
566
        return bdrv_find_format("file");
567
    }
568

    
569
    p = strchr(filename, ':');
570
    assert(p != NULL);
571
    len = p - filename;
572
    if (len > sizeof(protocol) - 1)
573
        len = sizeof(protocol) - 1;
574
    memcpy(protocol, filename, len);
575
    protocol[len] = '\0';
576
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
577
        if (drv1->protocol_name &&
578
            !strcmp(drv1->protocol_name, protocol)) {
579
            return drv1;
580
        }
581
    }
582
    return NULL;
583
}
584

    
585
static int find_image_format(BlockDriverState *bs, const char *filename,
586
                             BlockDriver **pdrv, Error **errp)
587
{
588
    int score, score_max;
589
    BlockDriver *drv1, *drv;
590
    uint8_t buf[2048];
591
    int ret = 0;
592

    
593
    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
594
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
595
        drv = bdrv_find_format("raw");
596
        if (!drv) {
597
            error_setg(errp, "Could not find raw image format");
598
            ret = -ENOENT;
599
        }
600
        *pdrv = drv;
601
        return ret;
602
    }
603

    
604
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
605
    if (ret < 0) {
606
        error_setg_errno(errp, -ret, "Could not read image for determining its "
607
                         "format");
608
        *pdrv = NULL;
609
        return ret;
610
    }
611

    
612
    score_max = 0;
613
    drv = NULL;
614
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
615
        if (drv1->bdrv_probe) {
616
            score = drv1->bdrv_probe(buf, ret, filename);
617
            if (score > score_max) {
618
                score_max = score;
619
                drv = drv1;
620
            }
621
        }
622
    }
623
    if (!drv) {
624
        error_setg(errp, "Could not determine image format: No compatible "
625
                   "driver found");
626
        ret = -ENOENT;
627
    }
628
    *pdrv = drv;
629
    return ret;
630
}
631

    
632
/**
633
 * Set the current 'total_sectors' value
634
 */
635
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
636
{
637
    BlockDriver *drv = bs->drv;
638

    
639
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
640
    if (bs->sg)
641
        return 0;
642

    
643
    /* query actual device if possible, otherwise just trust the hint */
644
    if (drv->bdrv_getlength) {
645
        int64_t length = drv->bdrv_getlength(bs);
646
        if (length < 0) {
647
            return length;
648
        }
649
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
650
    }
651

    
652
    bs->total_sectors = hint;
653
    return 0;
654
}
655

    
656
/**
657
 * Set open flags for a given discard mode
658
 *
659
 * Return 0 on success, -1 if the discard mode was invalid.
660
 */
661
int bdrv_parse_discard_flags(const char *mode, int *flags)
662
{
663
    *flags &= ~BDRV_O_UNMAP;
664

    
665
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
666
        /* do nothing */
667
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
668
        *flags |= BDRV_O_UNMAP;
669
    } else {
670
        return -1;
671
    }
672

    
673
    return 0;
674
}
675

    
676
/**
677
 * Set open flags for a given cache mode
678
 *
679
 * Return 0 on success, -1 if the cache mode was invalid.
680
 */
681
int bdrv_parse_cache_flags(const char *mode, int *flags)
682
{
683
    *flags &= ~BDRV_O_CACHE_MASK;
684

    
685
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
686
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
687
    } else if (!strcmp(mode, "directsync")) {
688
        *flags |= BDRV_O_NOCACHE;
689
    } else if (!strcmp(mode, "writeback")) {
690
        *flags |= BDRV_O_CACHE_WB;
691
    } else if (!strcmp(mode, "unsafe")) {
692
        *flags |= BDRV_O_CACHE_WB;
693
        *flags |= BDRV_O_NO_FLUSH;
694
    } else if (!strcmp(mode, "writethrough")) {
695
        /* this is the default */
696
    } else {
697
        return -1;
698
    }
699

    
700
    return 0;
701
}
702

    
703
/**
704
 * The copy-on-read flag is actually a reference count so multiple users may
705
 * use the feature without worrying about clobbering its previous state.
706
 * Copy-on-read stays enabled until all users have called to disable it.
707
 */
708
void bdrv_enable_copy_on_read(BlockDriverState *bs)
709
{
710
    bs->copy_on_read++;
711
}
712

    
713
/* Drop one copy-on-read reference of bs; the count must be positive
 * (asserted). */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
718

    
719
/*
 * Derive the flags handed to a driver's open callback from the caller's
 * flags: BDRV_O_CACHE_WB is always set, the flags internal to the block
 * layer (BDRV_O_SNAPSHOT, BDRV_O_NO_BACKING) are cleared, and temporary
 * images are made writable.
 */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    const int internal_flags = BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING;
    int open_flags = (flags | BDRV_O_CACHE_WB) & ~internal_flags;

    /* Snapshots should be writable. */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
738

    
739
static int bdrv_assign_node_name(BlockDriverState *bs,
740
                                 const char *node_name,
741
                                 Error **errp)
742
{
743
    if (!node_name) {
744
        return 0;
745
    }
746

    
747
    /* empty string node name is invalid */
748
    if (node_name[0] == '\0') {
749
        error_setg(errp, "Empty node name");
750
        return -EINVAL;
751
    }
752

    
753
    /* takes care of avoiding duplicates node names */
754
    if (bdrv_find_node(node_name)) {
755
        error_setg(errp, "Duplicate node name");
756
        return -EINVAL;
757
    }
758

    
759
    /* copy node name into the bs and insert it into the graph list */
760
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
761
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
762

    
763
    return 0;
764
}
765

    
766
/*
767
 * Common part for opening disk images and files
768
 *
769
 * Removes all processed options from *options.
770
 */
771
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
772
    QDict *options, int flags, BlockDriver *drv, Error **errp)
773
{
774
    int ret, open_flags;
775
    const char *filename;
776
    const char *node_name = NULL;
777
    Error *local_err = NULL;
778

    
779
    assert(drv != NULL);
780
    assert(bs->file == NULL);
781
    assert(options != NULL && bs->options != options);
782

    
783
    if (file != NULL) {
784
        filename = file->filename;
785
    } else {
786
        filename = qdict_get_try_str(options, "filename");
787
    }
788

    
789
    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
790

    
791
    node_name = qdict_get_try_str(options, "node-name");
792
    ret = bdrv_assign_node_name(bs, node_name, errp);
793
    if (ret < 0) {
794
        return ret;
795
    }
796
    qdict_del(options, "node-name");
797

    
798
    /* bdrv_open() with directly using a protocol as drv. This layer is already
799
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
800
     * and return immediately. */
801
    if (file != NULL && drv->bdrv_file_open) {
802
        bdrv_swap(file, bs);
803
        return 0;
804
    }
805

    
806
    bs->open_flags = flags;
807
    bs->buffer_alignment = 512;
808
    bs->zero_beyond_eof = true;
809
    open_flags = bdrv_open_flags(bs, flags);
810
    bs->read_only = !(open_flags & BDRV_O_RDWR);
811

    
812
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
813
        error_setg(errp,
814
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
815
                        ? "Driver '%s' can only be used for read-only devices"
816
                        : "Driver '%s' is not whitelisted",
817
                   drv->format_name);
818
        return -ENOTSUP;
819
    }
820

    
821
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
822
    if (flags & BDRV_O_COPY_ON_READ) {
823
        if (!bs->read_only) {
824
            bdrv_enable_copy_on_read(bs);
825
        } else {
826
            error_setg(errp, "Can't use copy-on-read on read-only device");
827
            return -EINVAL;
828
        }
829
    }
830

    
831
    if (filename != NULL) {
832
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
833
    } else {
834
        bs->filename[0] = '\0';
835
    }
836

    
837
    bs->drv = drv;
838
    bs->opaque = g_malloc0(drv->instance_size);
839

    
840
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
841

    
842
    /* Open the image, either directly or using a protocol */
843
    if (drv->bdrv_file_open) {
844
        assert(file == NULL);
845
        assert(!drv->bdrv_needs_filename || filename != NULL);
846
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
847
    } else {
848
        if (file == NULL) {
849
            error_setg(errp, "Can't use '%s' as a block driver for the "
850
                       "protocol level", drv->format_name);
851
            ret = -EINVAL;
852
            goto free_and_fail;
853
        }
854
        bs->file = file;
855
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
856
    }
857

    
858
    if (ret < 0) {
859
        if (error_is_set(&local_err)) {
860
            error_propagate(errp, local_err);
861
        } else if (bs->filename[0]) {
862
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
863
        } else {
864
            error_setg_errno(errp, -ret, "Could not open image");
865
        }
866
        goto free_and_fail;
867
    }
868

    
869
    ret = refresh_total_sectors(bs, bs->total_sectors);
870
    if (ret < 0) {
871
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
872
        goto free_and_fail;
873
    }
874

    
875
#ifndef _WIN32
876
    if (bs->is_temporary) {
877
        assert(bs->filename[0] != '\0');
878
        unlink(bs->filename);
879
    }
880
#endif
881
    return 0;
882

    
883
free_and_fail:
884
    bs->file = NULL;
885
    g_free(bs->opaque);
886
    bs->opaque = NULL;
887
    bs->drv = NULL;
888
    return ret;
889
}
890

    
891
/*
892
 * Opens a file using a protocol (file, host_device, nbd, ...)
893
 *
894
 * options is a QDict of options to pass to the block drivers, or NULL for an
895
 * empty set of options. The reference to the QDict belongs to the block layer
896
 * after the call (even on failure), so if the caller intends to reuse the
897
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
898
 */
899
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
900
                   const char *reference, QDict *options, int flags,
901
                   Error **errp)
902
{
903
    BlockDriverState *bs = NULL;
904
    BlockDriver *drv;
905
    const char *drvname;
906
    bool allow_protocol_prefix = false;
907
    Error *local_err = NULL;
908
    int ret;
909

    
910
    /* NULL means an empty set of options */
911
    if (options == NULL) {
912
        options = qdict_new();
913
    }
914

    
915
    if (reference) {
916
        if (filename || qdict_size(options)) {
917
            error_setg(errp, "Cannot reference an existing block device with "
918
                       "additional options or a new filename");
919
            return -EINVAL;
920
        }
921
        QDECREF(options);
922

    
923
        bs = bdrv_find(reference);
924
        if (!bs) {
925
            error_setg(errp, "Cannot find block device '%s'", reference);
926
            return -ENODEV;
927
        }
928
        bdrv_ref(bs);
929
        *pbs = bs;
930
        return 0;
931
    }
932

    
933
    bs = bdrv_new("");
934
    bs->options = options;
935
    options = qdict_clone_shallow(options);
936

    
937
    /* Fetch the file name from the options QDict if necessary */
938
    if (!filename) {
939
        filename = qdict_get_try_str(options, "filename");
940
    } else if (filename && !qdict_haskey(options, "filename")) {
941
        qdict_put(options, "filename", qstring_from_str(filename));
942
        allow_protocol_prefix = true;
943
    } else {
944
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
945
                   "same time");
946
        ret = -EINVAL;
947
        goto fail;
948
    }
949

    
950
    /* Find the right block driver */
951
    drvname = qdict_get_try_str(options, "driver");
952
    if (drvname) {
953
        drv = bdrv_find_format(drvname);
954
        if (!drv) {
955
            error_setg(errp, "Unknown driver '%s'", drvname);
956
        }
957
        qdict_del(options, "driver");
958
    } else if (filename) {
959
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
960
        if (!drv) {
961
            error_setg(errp, "Unknown protocol");
962
        }
963
    } else {
964
        error_setg(errp, "Must specify either driver or file");
965
        drv = NULL;
966
    }
967

    
968
    if (!drv) {
969
        /* errp has been set already */
970
        ret = -ENOENT;
971
        goto fail;
972
    }
973

    
974
    /* Parse the filename and open it */
975
    if (drv->bdrv_parse_filename && filename) {
976
        drv->bdrv_parse_filename(filename, options, &local_err);
977
        if (error_is_set(&local_err)) {
978
            error_propagate(errp, local_err);
979
            ret = -EINVAL;
980
            goto fail;
981
        }
982
        qdict_del(options, "filename");
983
    } else if (drv->bdrv_needs_filename && !filename) {
984
        error_setg(errp, "The '%s' block driver requires a file name",
985
                   drv->format_name);
986
        ret = -EINVAL;
987
        goto fail;
988
    }
989

    
990
    if (!drv->bdrv_file_open) {
991
        ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
992
        options = NULL;
993
    } else {
994
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
995
    }
996
    if (ret < 0) {
997
        error_propagate(errp, local_err);
998
        goto fail;
999
    }
1000

    
1001
    /* Check if any unknown options were used */
1002
    if (options && (qdict_size(options) != 0)) {
1003
        const QDictEntry *entry = qdict_first(options);
1004
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
1005
                   drv->format_name, entry->key);
1006
        ret = -EINVAL;
1007
        goto fail;
1008
    }
1009
    QDECREF(options);
1010

    
1011
    bs->growable = 1;
1012
    *pbs = bs;
1013
    return 0;
1014

    
1015
fail:
1016
    QDECREF(options);
1017
    if (!bs->drv) {
1018
        QDECREF(bs->options);
1019
    }
1020
    bdrv_unref(bs);
1021
    return ret;
1022
}
1023

    
1024
/*
1025
 * Opens the backing file for a BlockDriverState if not yet open
1026
 *
1027
 * options is a QDict of options to pass to the block drivers, or NULL for an
1028
 * empty set of options. The reference to the QDict is transferred to this
1029
 * function (even on failure), so if the caller intends to reuse the dictionary,
1030
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
1031
 */
1032
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1033
{
1034
    char backing_filename[PATH_MAX];
1035
    int back_flags, ret;
1036
    BlockDriver *back_drv = NULL;
1037
    Error *local_err = NULL;
1038

    
1039
    if (bs->backing_hd != NULL) {
1040
        QDECREF(options);
1041
        return 0;
1042
    }
1043

    
1044
    /* NULL means an empty set of options */
1045
    if (options == NULL) {
1046
        options = qdict_new();
1047
    }
1048

    
1049
    bs->open_flags &= ~BDRV_O_NO_BACKING;
1050
    if (qdict_haskey(options, "file.filename")) {
1051
        backing_filename[0] = '\0';
1052
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1053
        QDECREF(options);
1054
        return 0;
1055
    } else {
1056
        bdrv_get_full_backing_filename(bs, backing_filename,
1057
                                       sizeof(backing_filename));
1058
    }
1059

    
1060
    bs->backing_hd = bdrv_new("");
1061

    
1062
    if (bs->backing_format[0] != '\0') {
1063
        back_drv = bdrv_find_format(bs->backing_format);
1064
    }
1065

    
1066
    /* backing files always opened read-only */
1067
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1068
                                    BDRV_O_COPY_ON_READ);
1069

    
1070
    ret = bdrv_open(bs->backing_hd,
1071
                    *backing_filename ? backing_filename : NULL, options,
1072
                    back_flags, back_drv, &local_err);
1073
    if (ret < 0) {
1074
        bdrv_unref(bs->backing_hd);
1075
        bs->backing_hd = NULL;
1076
        bs->open_flags |= BDRV_O_NO_BACKING;
1077
        error_setg(errp, "Could not open backing file: %s",
1078
                   error_get_pretty(local_err));
1079
        error_free(local_err);
1080
        return ret;
1081
    }
1082

    
1083
    if (bs->backing_hd->file) {
1084
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1085
                bs->backing_hd->file->filename);
1086
    }
1087

    
1088
    return 0;
1089
}
1090

    
1091
/*
1092
 * Opens a disk image whose options are given as BlockdevRef in another block
1093
 * device's options.
1094
 *
1095
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
1096
 * image format auto-detection. If it is false and a filename is given,
1097
 * bdrv_open() will be used for auto-detection.
1098
 *
1099
 * If allow_none is true, no image will be opened if filename is false and no
1100
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1101
 *
1102
 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1103
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1104
 * itself, all options starting with "${bdref_key}." are considered part of the
1105
 * BlockdevRef.
1106
 *
1107
 * The BlockdevRef will be removed from the options QDict.
1108
 */
1109
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1110
                    QDict *options, const char *bdref_key, int flags,
1111
                    bool force_raw, bool allow_none, Error **errp)
1112
{
1113
    QDict *image_options;
1114
    int ret;
1115
    char *bdref_key_dot;
1116
    const char *reference;
1117

    
1118
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1119
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1120
    g_free(bdref_key_dot);
1121

    
1122
    reference = qdict_get_try_str(options, bdref_key);
1123
    if (!filename && !reference && !qdict_size(image_options)) {
1124
        if (allow_none) {
1125
            ret = 0;
1126
        } else {
1127
            error_setg(errp, "A block device must be specified for \"%s\"",
1128
                       bdref_key);
1129
            ret = -EINVAL;
1130
        }
1131
        goto done;
1132
    }
1133

    
1134
    if (filename && !force_raw) {
1135
        /* If a filename is given and the block driver should be detected
1136
           automatically (instead of using none), use bdrv_open() in order to do
1137
           that auto-detection. */
1138
        BlockDriverState *bs;
1139

    
1140
        if (reference) {
1141
            error_setg(errp, "Cannot reference an existing block device while "
1142
                       "giving a filename");
1143
            ret = -EINVAL;
1144
            goto done;
1145
        }
1146

    
1147
        bs = bdrv_new("");
1148
        ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
1149
        if (ret < 0) {
1150
            bdrv_unref(bs);
1151
        } else {
1152
            *pbs = bs;
1153
        }
1154
    } else {
1155
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
1156
                             errp);
1157
    }
1158

    
1159
done:
1160
    qdict_del(options, bdref_key);
1161
    return ret;
1162
}
1163

    
1164
/*
1165
 * Opens a disk image (raw, qcow2, vmdk, ...)
1166
 *
1167
 * options is a QDict of options to pass to the block drivers, or NULL for an
1168
 * empty set of options. The reference to the QDict belongs to the block layer
1169
 * after the call (even on failure), so if the caller intends to reuse the
1170
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1171
 */
1172
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
1173
              int flags, BlockDriver *drv, Error **errp)
1174
{
1175
    int ret;
1176
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1177
    char tmp_filename[PATH_MAX + 1];
1178
    BlockDriverState *file = NULL;
1179
    const char *drvname;
1180
    Error *local_err = NULL;
1181

    
1182
    /* NULL means an empty set of options */
1183
    if (options == NULL) {
1184
        options = qdict_new();
1185
    }
1186

    
1187
    bs->options = options;
1188
    options = qdict_clone_shallow(options);
1189

    
1190
    /* For snapshot=on, create a temporary qcow2 overlay */
1191
    if (flags & BDRV_O_SNAPSHOT) {
1192
        BlockDriverState *bs1;
1193
        int64_t total_size;
1194
        BlockDriver *bdrv_qcow2;
1195
        QEMUOptionParameter *create_options;
1196
        QDict *snapshot_options;
1197

    
1198
        /* if snapshot, we create a temporary backing file and open it
1199
           instead of opening 'filename' directly */
1200

    
1201
        /* Get the required size from the image */
1202
        bs1 = bdrv_new("");
1203
        QINCREF(options);
1204
        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
1205
                        drv, &local_err);
1206
        if (ret < 0) {
1207
            bdrv_unref(bs1);
1208
            goto fail;
1209
        }
1210
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1211

    
1212
        bdrv_unref(bs1);
1213

    
1214
        /* Create the temporary image */
1215
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1216
        if (ret < 0) {
1217
            error_setg_errno(errp, -ret, "Could not get temporary filename");
1218
            goto fail;
1219
        }
1220

    
1221
        bdrv_qcow2 = bdrv_find_format("qcow2");
1222
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1223
                                                 NULL);
1224

    
1225
        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1226

    
1227
        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1228
        free_option_parameters(create_options);
1229
        if (ret < 0) {
1230
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
1231
                             "'%s': %s", tmp_filename,
1232
                             error_get_pretty(local_err));
1233
            error_free(local_err);
1234
            local_err = NULL;
1235
            goto fail;
1236
        }
1237

    
1238
        /* Prepare a new options QDict for the temporary file, where user
1239
         * options refer to the backing file */
1240
        if (filename) {
1241
            qdict_put(options, "file.filename", qstring_from_str(filename));
1242
        }
1243
        if (drv) {
1244
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
1245
        }
1246

    
1247
        snapshot_options = qdict_new();
1248
        qdict_put(snapshot_options, "backing", options);
1249
        qdict_flatten(snapshot_options);
1250

    
1251
        bs->options = snapshot_options;
1252
        options = qdict_clone_shallow(bs->options);
1253

    
1254
        filename = tmp_filename;
1255
        drv = bdrv_qcow2;
1256
        bs->is_temporary = 1;
1257
    }
1258

    
1259
    /* Open image file without format layer */
1260
    if (flags & BDRV_O_RDWR) {
1261
        flags |= BDRV_O_ALLOW_RDWR;
1262
    }
1263

    
1264
    ret = bdrv_open_image(&file, filename, options, "file",
1265
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
1266
                          &local_err);
1267
    if (ret < 0) {
1268
        goto fail;
1269
    }
1270

    
1271
    /* Find the right image format driver */
1272
    drvname = qdict_get_try_str(options, "driver");
1273
    if (drvname) {
1274
        drv = bdrv_find_format(drvname);
1275
        qdict_del(options, "driver");
1276
        if (!drv) {
1277
            error_setg(errp, "Invalid driver: '%s'", drvname);
1278
            ret = -EINVAL;
1279
            goto unlink_and_fail;
1280
        }
1281
    }
1282

    
1283
    if (!drv) {
1284
        if (file) {
1285
            ret = find_image_format(file, filename, &drv, &local_err);
1286
        } else {
1287
            error_setg(errp, "Must specify either driver or file");
1288
            ret = -EINVAL;
1289
            goto unlink_and_fail;
1290
        }
1291
    }
1292

    
1293
    if (!drv) {
1294
        goto unlink_and_fail;
1295
    }
1296

    
1297
    /* Open the image */
1298
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1299
    if (ret < 0) {
1300
        goto unlink_and_fail;
1301
    }
1302

    
1303
    if (file && (bs->file != file)) {
1304
        bdrv_unref(file);
1305
        file = NULL;
1306
    }
1307

    
1308
    /* If there is a backing file, use it */
1309
    if ((flags & BDRV_O_NO_BACKING) == 0) {
1310
        QDict *backing_options;
1311

    
1312
        qdict_extract_subqdict(options, &backing_options, "backing.");
1313
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1314
        if (ret < 0) {
1315
            goto close_and_fail;
1316
        }
1317
    }
1318

    
1319
    /* Check if any unknown options were used */
1320
    if (qdict_size(options) != 0) {
1321
        const QDictEntry *entry = qdict_first(options);
1322
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1323
                   "support the option '%s'", drv->format_name, bs->device_name,
1324
                   entry->key);
1325

    
1326
        ret = -EINVAL;
1327
        goto close_and_fail;
1328
    }
1329
    QDECREF(options);
1330

    
1331
    if (!bdrv_key_required(bs)) {
1332
        bdrv_dev_change_media_cb(bs, true);
1333
    }
1334

    
1335
    return 0;
1336

    
1337
unlink_and_fail:
1338
    if (file != NULL) {
1339
        bdrv_unref(file);
1340
    }
1341
    if (bs->is_temporary) {
1342
        unlink(filename);
1343
    }
1344
fail:
1345
    QDECREF(bs->options);
1346
    QDECREF(options);
1347
    bs->options = NULL;
1348
    if (error_is_set(&local_err)) {
1349
        error_propagate(errp, local_err);
1350
    }
1351
    return ret;
1352

    
1353
close_and_fail:
1354
    bdrv_close(bs);
1355
    QDECREF(options);
1356
    if (error_is_set(&local_err)) {
1357
        error_propagate(errp, local_err);
1358
    }
1359
    return ret;
1360
}
1361

    
1362
typedef struct BlockReopenQueueEntry {
1363
     bool prepared;
1364
     BDRVReopenState state;
1365
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1366
} BlockReopenQueueEntry;
1367

    
1368
/*
1369
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1370
 * reopen of multiple devices.
1371
 *
1372
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1373
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1374
 * be created and initialized. This newly created BlockReopenQueue should be
1375
 * passed back in for subsequent calls that are intended to be of the same
1376
 * atomic 'set'.
1377
 *
1378
 * bs is the BlockDriverState to add to the reopen queue.
1379
 *
1380
 * flags contains the open flags for the associated bs
1381
 *
1382
 * returns a pointer to bs_queue, which is either the newly allocated
1383
 * bs_queue, or the existing bs_queue being used.
1384
 *
1385
 */
1386
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1387
                                    BlockDriverState *bs, int flags)
1388
{
1389
    assert(bs != NULL);
1390

    
1391
    BlockReopenQueueEntry *bs_entry;
1392
    if (bs_queue == NULL) {
1393
        bs_queue = g_new0(BlockReopenQueue, 1);
1394
        QSIMPLEQ_INIT(bs_queue);
1395
    }
1396

    
1397
    if (bs->file) {
1398
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1399
    }
1400

    
1401
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1402
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1403

    
1404
    bs_entry->state.bs = bs;
1405
    bs_entry->state.flags = flags;
1406

    
1407
    return bs_queue;
1408
}
1409

    
1410
/*
1411
 * Reopen multiple BlockDriverStates atomically & transactionally.
1412
 *
1413
 * The queue passed in (bs_queue) must have been built up previous
1414
 * via bdrv_reopen_queue().
1415
 *
1416
 * Reopens all BDS specified in the queue, with the appropriate
1417
 * flags.  All devices are prepared for reopen, and failure of any
1418
 * device will cause all device changes to be abandonded, and intermediate
1419
 * data cleaned up.
1420
 *
1421
 * If all devices prepare successfully, then the changes are committed
1422
 * to all devices.
1423
 *
1424
 */
1425
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1426
{
1427
    int ret = -1;
1428
    BlockReopenQueueEntry *bs_entry, *next;
1429
    Error *local_err = NULL;
1430

    
1431
    assert(bs_queue != NULL);
1432

    
1433
    bdrv_drain_all();
1434

    
1435
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1436
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1437
            error_propagate(errp, local_err);
1438
            goto cleanup;
1439
        }
1440
        bs_entry->prepared = true;
1441
    }
1442

    
1443
    /* If we reach this point, we have success and just need to apply the
1444
     * changes
1445
     */
1446
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1447
        bdrv_reopen_commit(&bs_entry->state);
1448
    }
1449

    
1450
    ret = 0;
1451

    
1452
cleanup:
1453
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1454
        if (ret && bs_entry->prepared) {
1455
            bdrv_reopen_abort(&bs_entry->state);
1456
        }
1457
        g_free(bs_entry);
1458
    }
1459
    g_free(bs_queue);
1460
    return ret;
1461
}
1462

    
1463

    
1464
/* Reopen a single BlockDriverState with the specified flags. */
1465
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1466
{
1467
    int ret = -1;
1468
    Error *local_err = NULL;
1469
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1470

    
1471
    ret = bdrv_reopen_multiple(queue, &local_err);
1472
    if (local_err != NULL) {
1473
        error_propagate(errp, local_err);
1474
    }
1475
    return ret;
1476
}
1477

    
1478

    
1479
/*
1480
 * Prepares a BlockDriverState for reopen. All changes are staged in the
1481
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1482
 * the block driver layer .bdrv_reopen_prepare()
1483
 *
1484
 * bs is the BlockDriverState to reopen
1485
 * flags are the new open flags
1486
 * queue is the reopen queue
1487
 *
1488
 * Returns 0 on success, non-zero on error.  On error errp will be set
1489
 * as well.
1490
 *
1491
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1492
 * It is the responsibility of the caller to then call the abort() or
1493
 * commit() for any other BDS that have been left in a prepare() state
1494
 *
1495
 */
1496
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1497
                        Error **errp)
1498
{
1499
    int ret = -1;
1500
    Error *local_err = NULL;
1501
    BlockDriver *drv;
1502

    
1503
    assert(reopen_state != NULL);
1504
    assert(reopen_state->bs->drv != NULL);
1505
    drv = reopen_state->bs->drv;
1506

    
1507
    /* if we are to stay read-only, do not allow permission change
1508
     * to r/w */
1509
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1510
        reopen_state->flags & BDRV_O_RDWR) {
1511
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1512
                  reopen_state->bs->device_name);
1513
        goto error;
1514
    }
1515

    
1516

    
1517
    ret = bdrv_flush(reopen_state->bs);
1518
    if (ret) {
1519
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1520
                  strerror(-ret));
1521
        goto error;
1522
    }
1523

    
1524
    if (drv->bdrv_reopen_prepare) {
1525
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1526
        if (ret) {
1527
            if (local_err != NULL) {
1528
                error_propagate(errp, local_err);
1529
            } else {
1530
                error_setg(errp, "failed while preparing to reopen image '%s'",
1531
                           reopen_state->bs->filename);
1532
            }
1533
            goto error;
1534
        }
1535
    } else {
1536
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1537
         * handler for each supported drv. */
1538
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1539
                  drv->format_name, reopen_state->bs->device_name,
1540
                 "reopening of file");
1541
        ret = -1;
1542
        goto error;
1543
    }
1544

    
1545
    ret = 0;
1546

    
1547
error:
1548
    return ret;
1549
}
1550

    
1551
/*
1552
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1553
 * makes them final by swapping the staging BlockDriverState contents into
1554
 * the active BlockDriverState contents.
1555
 */
1556
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1557
{
1558
    BlockDriver *drv;
1559

    
1560
    assert(reopen_state != NULL);
1561
    drv = reopen_state->bs->drv;
1562
    assert(drv != NULL);
1563

    
1564
    /* If there are any driver level actions to take */
1565
    if (drv->bdrv_reopen_commit) {
1566
        drv->bdrv_reopen_commit(reopen_state);
1567
    }
1568

    
1569
    /* set BDS specific flags now */
1570
    reopen_state->bs->open_flags         = reopen_state->flags;
1571
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1572
                                              BDRV_O_CACHE_WB);
1573
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1574
}
1575

    
1576
/*
1577
 * Abort the reopen, and delete and free the staged changes in
1578
 * reopen_state
1579
 */
1580
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1581
{
1582
    BlockDriver *drv;
1583

    
1584
    assert(reopen_state != NULL);
1585
    drv = reopen_state->bs->drv;
1586
    assert(drv != NULL);
1587

    
1588
    if (drv->bdrv_reopen_abort) {
1589
        drv->bdrv_reopen_abort(reopen_state);
1590
    }
1591
}
1592

    
1593

    
1594
/*
 * Closes bs: cancels its block job, drains and flushes I/O, notifies the
 * close notifiers and, if a driver is attached, tears down the driver state
 * and resets the per-image fields. The BlockDriverState itself is not freed.
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job != NULL) {
        block_job_cancel_sync(bs->job);
    }

    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */

    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv != NULL) {
        /* Drop the backing image before the driver closes this one */
        if (bs->backing_hd != NULL) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }

        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;

        /* Reset everything that described the image just closed */
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;

        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1643

    
1644
void bdrv_close_all(void)
1645
{
1646
    BlockDriverState *bs;
1647

    
1648
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1649
        bdrv_close(bs);
1650
    }
1651
}
1652

    
1653
/* Check if any requests are in-flight (including throttled requests) */
1654
static bool bdrv_requests_pending(BlockDriverState *bs)
1655
{
1656
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
1657
        return true;
1658
    }
1659
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1660
        return true;
1661
    }
1662
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1663
        return true;
1664
    }
1665
    if (bs->file && bdrv_requests_pending(bs->file)) {
1666
        return true;
1667
    }
1668
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1669
        return true;
1670
    }
1671
    return false;
1672
}
1673

    
1674
static bool bdrv_requests_pending_all(void)
1675
{
1676
    BlockDriverState *bs;
1677
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1678
        if (bdrv_requests_pending(bs)) {
1679
            return true;
1680
        }
1681
    }
1682
    return false;
1683
}
1684

    
1685
/*
1686
 * Wait for pending requests to complete across all BlockDriverStates
1687
 *
1688
 * This function does not flush data to disk, use bdrv_flush_all() for that
1689
 * after calling this function.
1690
 *
1691
 * Note that completion of an asynchronous I/O operation can trigger any
1692
 * number of other I/O operations on other devices---for example a coroutine
1693
 * can be arbitrarily complex and a constant flow of I/O can come until the
1694
 * coroutine is complete.  Because of this, it is not possible to have a
1695
 * function to drain a single device's I/O queue.
1696
 */
1697
void bdrv_drain_all(void)
1698
{
1699
    /* Always run first iteration so any pending completion BHs run */
1700
    bool busy = true;
1701
    BlockDriverState *bs;
1702

    
1703
    while (busy) {
1704
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1705
            bdrv_start_throttled_reqs(bs);
1706
        }
1707

    
1708
        busy = bdrv_requests_pending_all();
1709
        busy |= aio_poll(qemu_get_aio_context(), busy);
1710
    }
1711
}
1712

    
1713
/* make a BlockDriverState anonymous by removing from bdrv_state and
1714
 * graph_bdrv_state list.
1715
   Also, NULL terminate the device_name to prevent double remove */
1716
void bdrv_make_anon(BlockDriverState *bs)
1717
{
1718
    if (bs->device_name[0] != '\0') {
1719
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1720
    }
1721
    bs->device_name[0] = '\0';
1722
    if (bs->node_name[0] != '\0') {
1723
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1724
    }
1725
    bs->node_name[0] = '\0';
1726
}
1727

    
1728
/* Invokes the driver's optional .bdrv_rebind() hook, if bs has a driver */
static void bdrv_rebind(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv == NULL || drv->bdrv_rebind == NULL) {
        return;
    }
    drv->bdrv_rebind(bs);
}
1734

    
1735
/*
 * Copies from bs_src to bs_dest the fields that need to stay attached to the
 * device. Used by bdrv_swap() to move these fields back after the two
 * structures have been exchanged wholesale.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* open flags */
    bs_dest->open_flags = bs_src->open_flags;

    /* attached device and its callbacks */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* I/O throttling state and the queues of throttled requests */
    memcpy(&bs_dest->throttle_state, &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmaps */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* block job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list = bs_src->node_list;
}
1786

    
1787
/*
1788
 * Swap bs contents for two image chains while they are live,
1789
 * while keeping required fields on the BlockDriverState that is
1790
 * actually attached to a device.
1791
 *
1792
 * This will modify the BlockDriverState fields, and swap contents
1793
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1794
 *
1795
 * bs_new is required to be anonymous.
1796
 *
1797
 * This function does not create any image files.
1798
 */
1799
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1800
{
1801
    BlockDriverState tmp;
1802

    
1803
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1804
    assert(bs_new->device_name[0] == '\0');
1805
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1806
    assert(bs_new->job == NULL);
1807
    assert(bs_new->dev == NULL);
1808
    assert(bs_new->in_use == 0);
1809
    assert(bs_new->io_limits_enabled == false);
1810
    assert(!throttle_have_timer(&bs_new->throttle_state));
1811

    
1812
    tmp = *bs_new;
1813
    *bs_new = *bs_old;
1814
    *bs_old = tmp;
1815

    
1816
    /* there are some fields that should not be swapped, move them back */
1817
    bdrv_move_feature_fields(&tmp, bs_old);
1818
    bdrv_move_feature_fields(bs_old, bs_new);
1819
    bdrv_move_feature_fields(bs_new, &tmp);
1820

    
1821
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1822
    assert(bs_new->device_name[0] == '\0');
1823

    
1824
    /* Check a few fields that should remain attached to the device */
1825
    assert(bs_new->dev == NULL);
1826
    assert(bs_new->job == NULL);
1827
    assert(bs_new->in_use == 0);
1828
    assert(bs_new->io_limits_enabled == false);
1829
    assert(!throttle_have_timer(&bs_new->throttle_state));
1830

    
1831
    bdrv_rebind(bs_new);
1832
    bdrv_rebind(bs_old);
1833
}
1834

    
1835
/*
1836
 * Add new bs contents at the top of an image chain while the chain is
1837
 * live, while keeping required fields on the top layer.
1838
 *
1839
 * This will modify the BlockDriverState fields, and swap contents
1840
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1841
 *
1842
 * bs_new is required to be anonymous.
1843
 *
1844
 * This function does not create any image files.
1845
 */
1846
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1847
{
1848
    bdrv_swap(bs_new, bs_top);
1849

    
1850
    /* The contents of 'tmp' will become bs_top, as we are
1851
     * swapping bs_new and bs_top contents. */
1852
    bs_top->backing_hd = bs_new;
1853
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1854
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1855
            bs_new->filename);
1856
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1857
            bs_new->drv ? bs_new->drv->format_name : "");
1858
}
1859

    
1860
/*
 * Closes bs, removes it from the global lists and frees it.
 *
 * bs must no longer have an attached device, a job, an in_use mark, any
 * references or dirty bitmaps (all asserted).
 */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(bs->dev == NULL);
    assert(bs->job == NULL);
    assert(bs->in_use == 0);
    assert(bs->refcnt == 0);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
1875

    
1876
/*
 * Attaches dev to bs and resets the I/O status of bs.
 *
 * Returns 0 on success, or -EBUSY if bs already has a device attached.
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev != NULL) {
        return -EBUSY;
    }

    bs->dev = dev;
    bdrv_iostatus_reset(bs);

    return 0;
}
1886

    
1887
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1888
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1889
{
1890
    if (bdrv_attach_dev(bs, dev) < 0) {
1891
        abort();
1892
    }
1893
}
1894

    
1895
/*
 * Detaches dev from bs. dev must be the device currently attached
 * (asserted). The device callbacks are cleared and the buffer alignment is
 * set back to 512.
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);

    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->dev = NULL;
    bs->buffer_alignment = 512;
}
1904

    
1905
/* TODO change to return DeviceState * when all users are qdevified */
1906
void *bdrv_get_attached_dev(BlockDriverState *bs)
1907
{
1908
    return bs->dev;
1909
}
1910

    
1911
/*
 * Installs the device callback table for bs together with the opaque
 * pointer that is handed to each callback.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_opaque = opaque;
    bs->dev_ops = ops;
}
1917

    
1918
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1919
                               enum MonitorEvent ev,
1920
                               BlockErrorAction action, bool is_read)
1921
{
1922
    QObject *data;
1923
    const char *action_str;
1924

    
1925
    switch (action) {
1926
    case BDRV_ACTION_REPORT:
1927
        action_str = "report";
1928
        break;
1929
    case BDRV_ACTION_IGNORE:
1930
        action_str = "ignore";
1931
        break;
1932
    case BDRV_ACTION_STOP:
1933
        action_str = "stop";
1934
        break;
1935
    default:
1936
        abort();
1937
    }
1938

    
1939
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1940
                              bdrv->device_name,
1941
                              action_str,
1942
                              is_read ? "read" : "write");
1943
    monitor_protocol_event(ev, data);
1944

    
1945
    qobject_decref(data);
1946
}
1947

    
1948
/*
 * Emit QEVENT_DEVICE_TRAY_MOVED for @bs; @ejected becomes the event's
 * 'tray-open' member.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    const char *device = bdrv_get_device_name(bs);
    QObject *event_data;

    event_data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                    device, ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event_data);

    qobject_decref(event_data);
}
1958

    
1959
/*
 * Notify the attached device of a media change and emit the matching tray
 * events.  Does nothing when the device has no change_media_cb callback.
 *
 * A "tray open" event is sent if the tray was closed before the callback
 * ran; a "tray close" event is sent if @load is set.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* Sample the tray state before the callback runs. */
    was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1974

    
1975
/*
 * Return true when no device is attached to @bs, or when the attached
 * device's callback table provides change_media_cb.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    if (!bs->dev_ops) {
        return false;
    }
    return bs->dev_ops->change_media_cb != NULL;
}
1979

    
1980
/*
 * Forward an eject request (with its @force flag) to the attached device's
 * eject_request_cb callback, if one is installed.
 */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->eject_request_cb) {
        return;
    }
    ops->eject_request_cb(bs->dev_opaque, force);
}
1986

    
1987
/*
 * Ask the attached device whether its tray is open.  Returns false when
 * the device provides no is_tray_open callback.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->is_tray_open) {
        return false;
    }
    return ops->is_tray_open(bs->dev_opaque);
}
1994

    
1995
/*
 * Invoke the attached device's resize_cb callback, if one is installed.
 */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->resize_cb) {
        return;
    }
    ops->resize_cb(bs->dev_opaque);
}
2001

    
2002
/*
 * Ask the attached device whether its medium is locked.  Returns false
 * when the device provides no is_medium_locked callback.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->is_medium_locked) {
        return false;
    }
    return ops->is_medium_locked(bs->dev_opaque);
}
2009

    
2010
/*
2011
 * Run consistency checks on an image
2012
 *
2013
 * Returns 0 if the check could be completed (it doesn't mean that the image is
2014
 * free of errors) or -errno when an internal error occurred. The results of the
2015
 * check are stored in res.
2016
 */
2017
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2018
{
2019
    if (bs->drv->bdrv_check == NULL) {
2020
        return -ENOTSUP;
2021
    }
2022

    
2023
    memset(res, 0, sizeof(*res));
2024
    return bs->drv->bdrv_check(bs, res, fix);
2025
}
2026

    
2027
#define COMMIT_BUF_SECTORS 2048
2028

    
2029
/* commit COW file into the raw image */
2030
int bdrv_commit(BlockDriverState *bs)
2031
{
2032
    BlockDriver *drv = bs->drv;
2033
    int64_t sector, total_sectors;
2034
    int n, ro, open_flags;
2035
    int ret = 0;
2036
    uint8_t *buf;
2037
    char filename[PATH_MAX];
2038

    
2039
    if (!drv)
2040
        return -ENOMEDIUM;
2041
    
2042
    if (!bs->backing_hd) {
2043
        return -ENOTSUP;
2044
    }
2045

    
2046
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2047
        return -EBUSY;
2048
    }
2049

    
2050
    ro = bs->backing_hd->read_only;
2051
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2052
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2053
    open_flags =  bs->backing_hd->open_flags;
2054

    
2055
    if (ro) {
2056
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2057
            return -EACCES;
2058
        }
2059
    }
2060

    
2061
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
2062
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2063

    
2064
    for (sector = 0; sector < total_sectors; sector += n) {
2065
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2066
        if (ret < 0) {
2067
            goto ro_cleanup;
2068
        }
2069
        if (ret) {
2070
            if (bdrv_read(bs, sector, buf, n) != 0) {
2071
                ret = -EIO;
2072
                goto ro_cleanup;
2073
            }
2074

    
2075
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
2076
                ret = -EIO;
2077
                goto ro_cleanup;
2078
            }
2079
        }
2080
    }
2081

    
2082
    if (drv->bdrv_make_empty) {
2083
        ret = drv->bdrv_make_empty(bs);
2084
        bdrv_flush(bs);
2085
    }
2086

    
2087
    /*
2088
     * Make sure all data we wrote to the backing device is actually
2089
     * stable on disk.
2090
     */
2091
    if (bs->backing_hd)
2092
        bdrv_flush(bs->backing_hd);
2093

    
2094
ro_cleanup:
2095
    g_free(buf);
2096

    
2097
    if (ro) {
2098
        /* ignoring error return here */
2099
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2100
    }
2101

    
2102
    return ret;
2103
}
2104

    
2105
int bdrv_commit_all(void)
2106
{
2107
    BlockDriverState *bs;
2108

    
2109
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2110
        if (bs->drv && bs->backing_hd) {
2111
            int ret = bdrv_commit(bs);
2112
            if (ret < 0) {
2113
                return ret;
2114
            }
2115
        }
2116
    }
2117
    return 0;
2118
}
2119

    
2120
/**
2121
 * Remove an active request from the tracked requests list
2122
 *
2123
 * This function should be called when a tracked request is completing.
2124
 */
2125
static void tracked_request_end(BdrvTrackedRequest *req)
2126
{
2127
    QLIST_REMOVE(req, list);
2128
    qemu_co_queue_restart_all(&req->wait_queue);
2129
}
2130

    
2131
/**
2132
 * Add an active request to the tracked requests list
2133
 */
2134
static void tracked_request_begin(BdrvTrackedRequest *req,
2135
                                  BlockDriverState *bs,
2136
                                  int64_t sector_num,
2137
                                  int nb_sectors, bool is_write)
2138
{
2139
    *req = (BdrvTrackedRequest){
2140
        .bs = bs,
2141
        .sector_num = sector_num,
2142
        .nb_sectors = nb_sectors,
2143
        .is_write = is_write,
2144
        .co = qemu_coroutine_self(),
2145
    };
2146

    
2147
    qemu_co_queue_init(&req->wait_queue);
2148

    
2149
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2150
}
2151

    
2152
/**
2153
 * Round a region to cluster boundaries
2154
 */
2155
void bdrv_round_to_clusters(BlockDriverState *bs,
2156
                            int64_t sector_num, int nb_sectors,
2157
                            int64_t *cluster_sector_num,
2158
                            int *cluster_nb_sectors)
2159
{
2160
    BlockDriverInfo bdi;
2161

    
2162
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2163
        *cluster_sector_num = sector_num;
2164
        *cluster_nb_sectors = nb_sectors;
2165
    } else {
2166
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2167
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2168
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2169
                                            nb_sectors, c);
2170
    }
2171
}
2172

    
2173
/*
 * Return true when the sector range [sector_num, sector_num + nb_sectors)
 * intersects the range covered by @req.
 */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    /* The ranges intersect iff each one starts before the other one ends. */
    return sector_num < req->sector_num + req->nb_sectors &&
           req->sector_num < sector_num + nb_sectors;
}
2185

    
2186
/*
 * Block the calling coroutine until no tracked request on @bs overlaps the
 * cluster-rounded range of the given request.
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    for (;;) {
        bool waited = false;

        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (!tracked_request_overlaps(req, cluster_sector_num,
                                          cluster_nb_sectors)) {
                continue;
            }

            /* Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests.  This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            qemu_co_queue_wait(&req->wait_queue);

            /* Rescan the list from its head after having waited. */
            waited = true;
            break;
        }

        if (!waited) {
            break;
        }
    }
}
2221

    
2222
/*
2223
 * Return values:
2224
 * 0        - success
2225
 * -EINVAL  - backing format specified, but no file
2226
 * -ENOSPC  - can't update the backing file because no space is left in the
2227
 *            image file header
2228
 * -ENOTSUP - format driver doesn't support changing the backing file
2229
 */
2230
int bdrv_change_backing_file(BlockDriverState *bs,
2231
    const char *backing_file, const char *backing_fmt)
2232
{
2233
    BlockDriver *drv = bs->drv;
2234
    int ret;
2235

    
2236
    /* Backing file format doesn't make sense without a backing file */
2237
    if (backing_fmt && !backing_file) {
2238
        return -EINVAL;
2239
    }
2240

    
2241
    if (drv->bdrv_change_backing_file != NULL) {
2242
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2243
    } else {
2244
        ret = -ENOTSUP;
2245
    }
2246

    
2247
    if (ret == 0) {
2248
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2249
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2250
    }
2251
    return ret;
2252
}
2253

    
2254
/*
2255
 * Finds the image layer in the chain that has 'bs' as its backing file.
2256
 *
2257
 * active is the current topmost image.
2258
 *
2259
 * Returns NULL if bs is not found in active's image chain,
2260
 * or if active == bs.
2261
 */
2262
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2263
                                    BlockDriverState *bs)
2264
{
2265
    BlockDriverState *overlay = NULL;
2266
    BlockDriverState *intermediate;
2267

    
2268
    assert(active != NULL);
2269
    assert(bs != NULL);
2270

    
2271
    /* if bs is the same as active, then by definition it has no overlay
2272
     */
2273
    if (active == bs) {
2274
        return NULL;
2275
    }
2276

    
2277
    intermediate = active;
2278
    while (intermediate->backing_hd) {
2279
        if (intermediate->backing_hd == bs) {
2280
            overlay = intermediate;
2281
            break;
2282
        }
2283
        intermediate = intermediate->backing_hd;
2284
    }
2285

    
2286
    return overlay;
2287
}
2288

    
2289
typedef struct BlkIntermediateStates {
2290
    BlockDriverState *bs;
2291
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2292
} BlkIntermediateStates;
2293

    
2294

    
2295
/*
2296
 * Drops images above 'base' up to and including 'top', and sets the image
2297
 * above 'top' to have base as its backing file.
2298
 *
2299
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2300
 * information in 'bs' can be properly updated.
2301
 *
2302
 * E.g., this will convert the following chain:
2303
 * bottom <- base <- intermediate <- top <- active
2304
 *
2305
 * to
2306
 *
2307
 * bottom <- base <- active
2308
 *
2309
 * It is allowed for bottom==base, in which case it converts:
2310
 *
2311
 * base <- intermediate <- top <- active
2312
 *
2313
 * to
2314
 *
2315
 * base <- active
2316
 *
2317
 * Error conditions:
2318
 *  if active == top, that is considered an error
2319
 *
2320
 */
2321
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2322
                           BlockDriverState *base)
2323
{
2324
    BlockDriverState *intermediate;
2325
    BlockDriverState *base_bs = NULL;
2326
    BlockDriverState *new_top_bs = NULL;
2327
    BlkIntermediateStates *intermediate_state, *next;
2328
    int ret = -EIO;
2329

    
2330
    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2331
    QSIMPLEQ_INIT(&states_to_delete);
2332

    
2333
    if (!top->drv || !base->drv) {
2334
        goto exit;
2335
    }
2336

    
2337
    new_top_bs = bdrv_find_overlay(active, top);
2338

    
2339
    if (new_top_bs == NULL) {
2340
        /* we could not find the image above 'top', this is an error */
2341
        goto exit;
2342
    }
2343

    
2344
    /* special case of new_top_bs->backing_hd already pointing to base - nothing
2345
     * to do, no intermediate images */
2346
    if (new_top_bs->backing_hd == base) {
2347
        ret = 0;
2348
        goto exit;
2349
    }
2350

    
2351
    intermediate = top;
2352

    
2353
    /* now we will go down through the list, and add each BDS we find
2354
     * into our deletion queue, until we hit the 'base'
2355
     */
2356
    while (intermediate) {
2357
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2358
        intermediate_state->bs = intermediate;
2359
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2360

    
2361
        if (intermediate->backing_hd == base) {
2362
            base_bs = intermediate->backing_hd;
2363
            break;
2364
        }
2365
        intermediate = intermediate->backing_hd;
2366
    }
2367
    if (base_bs == NULL) {
2368
        /* something went wrong, we did not end at the base. safely
2369
         * unravel everything, and exit with error */
2370
        goto exit;
2371
    }
2372

    
2373
    /* success - we can delete the intermediate states, and link top->base */
2374
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2375
                                   base_bs->drv ? base_bs->drv->format_name : "");
2376
    if (ret) {
2377
        goto exit;
2378
    }
2379
    new_top_bs->backing_hd = base_bs;
2380

    
2381

    
2382
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2383
        /* so that bdrv_close() does not recursively close the chain */
2384
        intermediate_state->bs->backing_hd = NULL;
2385
        bdrv_unref(intermediate_state->bs);
2386
    }
2387
    ret = 0;
2388

    
2389
exit:
2390
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2391
        g_free(intermediate_state);
2392
    }
2393
    return ret;
2394
}
2395

    
2396

    
2397
/*
 * Validate a byte-granularity request against @bs.
 *
 * Returns -ENOMEDIUM when no medium is inserted, 0 when @bs is growable or
 * the range [offset, offset + size) lies within the current length, and
 * -EIO otherwise.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    /* growable images accept any range */
    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0 || offset > len || len - offset < size) {
        return -EIO;
    }

    return 0;
}
2418

    
2419
/*
 * Sector-granularity wrapper around bdrv_check_byte_request(): converts the
 * sector range to bytes and returns that function's result.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    int64_t offset = sector_num * BDRV_SECTOR_SIZE;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;

    return bdrv_check_byte_request(bs, offset, size);
}
2425

    
2426
typedef struct RwCo {
2427
    BlockDriverState *bs;
2428
    int64_t sector_num;
2429
    int nb_sectors;
2430
    QEMUIOVector *qiov;
2431
    bool is_write;
2432
    int ret;
2433
    BdrvRequestFlags flags;
2434
} RwCo;
2435

    
2436
/*
 * Coroutine entry point: perform the read or write described by the RwCo
 * passed as @opaque and store the result in its ret member.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;
    int result;

    if (rwco->is_write) {
        result = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                   rwco->nb_sectors, rwco->qiov,
                                   rwco->flags);
    } else {
        result = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                  rwco->nb_sectors, rwco->qiov,
                                  rwco->flags);
    }
    rwco->ret = result;
}
2450

    
2451
/*
2452
 * Process a vectored synchronous request using coroutines
2453
 */
2454
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
2455
                       QEMUIOVector *qiov, bool is_write,
2456
                       BdrvRequestFlags flags)
2457
{
2458
    Coroutine *co;
2459
    RwCo rwco = {
2460
        .bs = bs,
2461
        .sector_num = sector_num,
2462
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
2463
        .qiov = qiov,
2464
        .is_write = is_write,
2465
        .ret = NOT_DONE,
2466
        .flags = flags,
2467
    };
2468
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
2469

    
2470
    /**
2471
     * In sync call context, when the vcpu is blocked, this throttling timer
2472
     * will not fire; so the I/O throttling function has to be disabled here
2473
     * if it has been enabled.
2474
     */
2475
    if (bs->io_limits_enabled) {
2476
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
2477
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
2478
        bdrv_io_limits_disable(bs);
2479
    }
2480

    
2481
    if (qemu_in_coroutine()) {
2482
        /* Fast-path if already in coroutine context */
2483
        bdrv_rw_co_entry(&rwco);
2484
    } else {
2485
        co = qemu_coroutine_create(bdrv_rw_co_entry);
2486
        qemu_coroutine_enter(co, &rwco);
2487
        while (rwco.ret == NOT_DONE) {
2488
            qemu_aio_wait();
2489
        }
2490
    }
2491
    return rwco.ret;
2492
}
2493

    
2494
/*
2495
 * Process a synchronous request using coroutines
2496
 */
2497
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2498
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
2499
{
2500
    QEMUIOVector qiov;
2501
    struct iovec iov = {
2502
        .iov_base = (void *)buf,
2503
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2504
    };
2505

    
2506
    qemu_iovec_init_external(&qiov, &iov, 1);
2507
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
2508
}
2509

    
2510
/* return < 0 if error. See bdrv_write() for the return codes */
2511
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2512
              uint8_t *buf, int nb_sectors)
2513
{
2514
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2515
}
2516

    
2517
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2518
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2519
                          uint8_t *buf, int nb_sectors)
2520
{
2521
    bool enabled;
2522
    int ret;
2523

    
2524
    enabled = bs->io_limits_enabled;
2525
    bs->io_limits_enabled = false;
2526
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2527
    bs->io_limits_enabled = enabled;
2528
    return ret;
2529
}
2530

    
2531
/* Return < 0 if error. Important errors are:
2532
  -EIO         generic I/O error (may happen for all errors)
2533
  -ENOMEDIUM   No media inserted.
2534
  -EINVAL      Invalid sector number or nb_sectors
2535
  -EACCES      Trying to write a read-only device
2536
*/
2537
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2538
               const uint8_t *buf, int nb_sectors)
2539
{
2540
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2541
}
2542

    
2543
/*
 * Synchronously write the contents of @qiov starting at @sector_num.
 * Returns the result of bdrv_rwv_co().
 */
int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    const bool is_write = true;

    return bdrv_rwv_co(bs, sector_num, qiov, is_write, 0);
}
2547

    
2548
/*
 * Synchronous zero write: issues a write request without a data buffer and
 * with BDRV_REQ_ZERO_WRITE added to @flags.
 */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    BdrvRequestFlags zero_flags = BDRV_REQ_ZERO_WRITE | flags;

    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, zero_flags);
}
2554

    
2555
/*
2556
 * Completely zero out a block device with the help of bdrv_write_zeroes.
2557
 * The operation is sped up by checking the block status and only writing
2558
 * zeroes to the device if they currently do not return zeroes. Optional
2559
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2560
 *
2561
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2562
 */
2563
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2564
{
2565
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2566
    int64_t ret, nb_sectors, sector_num = 0;
2567
    int n;
2568

    
2569
    for (;;) {
2570
        nb_sectors = target_size - sector_num;
2571
        if (nb_sectors <= 0) {
2572
            return 0;
2573
        }
2574
        if (nb_sectors > INT_MAX) {
2575
            nb_sectors = INT_MAX;
2576
        }
2577
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2578
        if (ret < 0) {
2579
            error_report("error getting block status at sector %" PRId64 ": %s",
2580
                         sector_num, strerror(-ret));
2581
            return ret;
2582
        }
2583
        if (ret & BDRV_BLOCK_ZERO) {
2584
            sector_num += n;
2585
            continue;
2586
        }
2587
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2588
        if (ret < 0) {
2589
            error_report("error writing zeroes at sector %" PRId64 ": %s",
2590
                         sector_num, strerror(-ret));
2591
            return ret;
2592
        }
2593
        sector_num += n;
2594
    }
2595
}
2596

    
2597
/*
 * Synchronous byte-granularity read of count1 bytes at offset into buf.
 *
 * The request is split into an unaligned head (read through a one-sector
 * bounce buffer), a run of whole sectors read directly into buf, and an
 * unaligned tail (bounce buffer again).
 *
 * Returns count1 on success, or the negative value returned by bdrv_read().
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    uint8_t *dst = buf;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int chunk, whole_sectors;
    int ret;

    /* first read to align to sector start */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), chunk);
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        dst += chunk;
    }

    /* read the sectors "in place" */
    whole_sectors = remaining >> BDRV_SECTOR_BITS;
    if (whole_sectors > 0) {
        ret = bdrv_read(bs, sector_num, dst, whole_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += whole_sectors;
        chunk = whole_sectors << BDRV_SECTOR_BITS;
        dst += chunk;
        remaining -= chunk;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, sector_buf, remaining);
    }
    return count1;
}
2641

    
2642
/*
 * Synchronous byte-granularity vectored write of @qiov at @offset.
 *
 * An unaligned head and tail are handled by read-modify-write of one sector
 * each through a bounce buffer; the whole sectors in between are written
 * straight from the matching slice of @qiov.
 *
 * Returns qiov->size on success, or the negative value returned by the
 * failing bdrv_read() / bdrv_write() / bdrv_writev() call.
 */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = qiov->size;
    int head, whole_sectors;
    int ret;

    /* first write to align to sector start */
    head = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (head > remaining) {
        head = remaining;
    }
    if (head > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, 0,
                          sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          head);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        remaining -= head;
        if (remaining == 0) {
            return qiov->size;
        }
        sector_num++;
    }

    /* write the sectors "in place" */
    whole_sectors = remaining >> BDRV_SECTOR_BITS;
    if (whole_sectors > 0) {
        QEMUIOVector qiov_inplace;

        /* slice of qiov that follows the head bytes */
        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, head,
                          whole_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += whole_sectors;
        remaining -= whole_sectors << BDRV_SECTOR_BITS;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, qiov->size - remaining, sector_buf, remaining);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }
    return qiov->size;
}
2698

    
2699
/*
 * Synchronous byte-granularity write of count1 bytes from buf at offset.
 * Wraps buf in a single-element I/O vector and returns the result of
 * bdrv_pwritev().
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov;

    iov.iov_base = (void *) buf;
    iov.iov_len = count1;
    qemu_iovec_init_external(&qiov, &iov, 1);

    return bdrv_pwritev(bs, offset, &qiov);
}
2711

    
2712
/*
2713
 * Writes to the file and ensures that no writes are reordered across this
2714
 * request (acts as a barrier)
2715
 *
2716
 * Returns 0 on success, -errno in error cases.
2717
 */
2718
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2719
    const void *buf, int count)
2720
{
2721
    int ret;
2722

    
2723
    ret = bdrv_pwrite(bs, offset, buf, count);
2724
    if (ret < 0) {
2725
        return ret;
2726
    }
2727

    
2728
    /* No flush needed for cache modes that already do it */
2729
    if (bs->enable_write_cache) {
2730
        bdrv_flush(bs);
2731
    }
2732

    
2733
    return 0;
2734
}
2735

    
2736
/*
 * Copy-on-read: read the whole cluster range covering the request through
 * the driver, write it back to the image (as a zero write when the data is
 * all zeroes and the driver supports it), then copy the requested part into
 * @qiov.
 *
 * Returns the first negative driver result, otherwise the result of the
 * write-back.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector bounce_qiov;
    struct iovec iov;
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    iov.iov_base = bounce_buffer;
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);

    if (ret >= 0) {
        if (drv->bdrv_co_write_zeroes &&
            buffer_is_zero(bounce_buffer, iov.iov_len)) {
            ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                          cluster_nb_sectors, 0);
        } else {
            /* This does not change the data on the disk, it is not necessary
             * to flush even in cache=writethrough mode.
             */
            ret = drv->bdrv_co_writev(bs, cluster_sector_num,
                                      cluster_nb_sectors, &bounce_qiov);
        }
    }

    /* It might be okay to ignore write errors for guest requests.  If this
     * is a deliberate copy-on-read then we don't want to ignore the error.
     * Simply report it in all cases.
     */
    if (ret >= 0) {
        size_t skip_bytes = (sector_num - cluster_sector_num) *
                            BDRV_SECTOR_SIZE;

        qemu_iovec_from_buf(qiov, 0, (uint8_t *)bounce_buffer + skip_bytes,
                            nb_sectors * BDRV_SECTOR_SIZE);
    }

    qemu_vfree(bounce_buffer);
    return ret;
}
2801

    
2802
/*
2803
 * Handle a read request in coroutine context
2804
 */
2805
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2806
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2807
    BdrvRequestFlags flags)
2808
{
2809
    BlockDriver *drv = bs->drv;
2810
    BdrvTrackedRequest req;
2811
    int ret;
2812

    
2813
    if (!drv) {
2814
        return -ENOMEDIUM;
2815
    }
2816
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2817
        return -EIO;
2818
    }
2819

    
2820
    if (bs->copy_on_read) {
2821
        flags |= BDRV_REQ_COPY_ON_READ;
2822
    }
2823
    if (flags & BDRV_REQ_COPY_ON_READ) {
2824
        bs->copy_on_read_in_flight++;
2825
    }
2826

    
2827
    if (bs->copy_on_read_in_flight) {
2828
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2829
    }
2830

    
2831
    /* throttling disk I/O */
2832
    if (bs->io_limits_enabled) {
2833
        bdrv_io_limits_intercept(bs, nb_sectors, false);
2834
    }
2835

    
2836
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2837

    
2838
    if (flags & BDRV_REQ_COPY_ON_READ) {
2839
        int pnum;
2840

    
2841
        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2842
        if (ret < 0) {
2843
            goto out;
2844
        }
2845

    
2846
        if (!ret || pnum != nb_sectors) {
2847
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2848
            goto out;
2849
        }
2850
    }
2851

    
2852
    if (!(bs->zero_beyond_eof && bs->growable)) {
2853
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2854
    } else {
2855
        /* Read zeros after EOF of growable BDSes */
2856
        int64_t len, total_sectors, max_nb_sectors;
2857

    
2858
        len = bdrv_getlength(bs);
2859
        if (len < 0) {
2860
            ret = len;
2861
            goto out;
2862
        }
2863

    
2864
        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2865
        max_nb_sectors = MAX(0, total_sectors - sector_num);
2866
        if (max_nb_sectors > 0) {
2867
            ret = drv->bdrv_co_readv(bs, sector_num,
2868
                                     MIN(nb_sectors, max_nb_sectors), qiov);
2869
        } else {
2870
            ret = 0;
2871
        }
2872

    
2873
        /* Reading beyond end of file is supposed to produce zeroes */
2874
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2875
            uint64_t offset = MAX(0, total_sectors - sector_num);
2876
            uint64_t bytes = (sector_num + nb_sectors - offset) *
2877
                              BDRV_SECTOR_SIZE;
2878
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2879
        }
2880
    }
2881

    
2882
out:
2883
    tracked_request_end(&req);
2884

    
2885
    if (flags & BDRV_REQ_COPY_ON_READ) {
2886
        bs->copy_on_read_in_flight--;
2887
    }
2888

    
2889
    return ret;
2890
}
2891

    
2892
/* Coroutine read entry point: emit the trace event, then hand the request
 * to bdrv_co_do_readv() with no request flags set. */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
                               int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2899

    
2900
/* Coroutine read entry point that requests copy-on-read handling: emit the
 * trace event, then forward to bdrv_co_do_readv() with
 * BDRV_REQ_COPY_ON_READ set. */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                                       int64_t sector_num, int nb_sectors,
                                       QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
2908

    
2909
/* if no limit is specified in the BlockLimits use a default
2910
 * of 32768 512-byte sectors (16 MiB) per request.
2911
 */
2912
#define MAX_WRITE_ZEROES_DEFAULT 32768
2913

    
2914
/*
 * Zero nb_sectors sectors starting at sector_num, one chunk at a time.
 *
 * Each chunk is first offered to the driver's bdrv_co_write_zeroes callback.
 * If the callback is missing or answers -ENOTSUP, the chunk is written with
 * bdrv_co_writev from a zero-filled bounce buffer instead.
 *
 * The loop stops at the first non-zero result; that result is returned.
 * Returns 0 if every chunk succeeded (or nb_sectors was not positive).
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector bounce_qiov;
    struct iovec bounce = {0};
    int ret = 0;

    /* Per-request cap: the driver's limit, or the default if none is set */
    int chunk_limit = bs->bl.max_write_zeroes ?
                      bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int chunk = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && chunk > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Unaligned start: only go up to the first aligned sector */
                chunk = bs->bl.write_zeroes_alignment;
                chunk -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + chunk) %
                       bs->bl.write_zeroes_alignment != 0) {
                /* Unaligned end: stop at the last aligned sector.  chunk
                 * cannot underflow because it exceeds the alignment.
                 */
                chunk -= (sector_num + chunk) % bs->bl.write_zeroes_alignment;
            }
        }

        /* Enforce the per-request size limit */
        if (chunk > chunk_limit) {
            chunk = chunk_limit;
        }

        /* Prefer the driver's efficient write-zeroes operation */
        ret = drv->bdrv_co_write_zeroes
              ? drv->bdrv_co_write_zeroes(bs, sector_num, chunk, flags)
              : -ENOTSUP;

        if (ret == -ENOTSUP) {
            /* Write zeroes is unsupported: use a zeroed bounce buffer */
            bounce.iov_len = chunk * BDRV_SECTOR_SIZE;
            if (bounce.iov_base == NULL) {
                bounce.iov_base = qemu_blockalign(bs,
                                                  chunk * BDRV_SECTOR_SIZE);
                memset(bounce.iov_base, 0, chunk * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&bounce_qiov, &bounce, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, chunk, &bounce_qiov);

            /* Only keep the bounce buffer if it is big enough for all
             * future requests, i.e. it was sized for a full-limit chunk.
             */
            if (chunk < chunk_limit) {
                qemu_vfree(bounce.iov_base);
                bounce.iov_base = NULL;
            }
        }

        sector_num += chunk;
        nb_sectors -= chunk;
    }

    qemu_vfree(bounce.iov_base);
    return ret;
}
2983

    
2984
/*
2985
 * Handle a write request in coroutine context
2986
 */
2987
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2988
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2989
    BdrvRequestFlags flags)
2990
{
2991
    BlockDriver *drv = bs->drv;
2992
    BdrvTrackedRequest req;
2993
    int ret;
2994

    
2995
    if (!bs->drv) {
2996
        return -ENOMEDIUM;
2997
    }
2998
    if (bs->read_only) {
2999
        return -EACCES;
3000
    }
3001
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3002
        return -EIO;
3003
    }
3004

    
3005
    if (bs->copy_on_read_in_flight) {
3006
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
3007
    }
3008

    
3009
    /* throttling disk I/O */
3010
    if (bs->io_limits_enabled) {
3011
        bdrv_io_limits_intercept(bs, nb_sectors, true);
3012
    }
3013

    
3014
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
3015

    
3016
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
3017

    
3018
    if (ret < 0) {
3019
        /* Do nothing, write notifier decided to fail this request */
3020
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
3021
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3022
    } else {
3023
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3024
    }
3025

    
3026
    if (ret == 0 && !bs->enable_write_cache) {
3027
        ret = bdrv_co_flush(bs);
3028
    }
3029

    
3030
    bdrv_set_dirty(bs, sector_num, nb_sectors);
3031

    
3032
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3033
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
3034
    }
3035
    if (bs->growable && ret >= 0) {
3036
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3037
    }
3038

    
3039
    tracked_request_end(&req);
3040

    
3041
    return ret;
3042
}
3043

    
3044
/* Coroutine write entry point: emit the trace event, then hand the request
 * to bdrv_co_do_writev() with no request flags set. */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
                                int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
3051

    
3052
/* Coroutine entry point for writing zeroes.  BDRV_REQ_MAY_UNMAP is dropped
 * from @flags unless the BDS was opened with BDRV_O_UNMAP; the request is
 * then issued through bdrv_co_do_writev() with BDRV_REQ_ZERO_WRITE added and
 * no I/O vector. */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    bool unmap_allowed;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    unmap_allowed = (bs->open_flags & BDRV_O_UNMAP) != 0;
    if (!unmap_allowed) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}
3065

    
3066
/**
3067
 * Truncate file to 'offset' bytes (needed only for file protocols)
3068
 */
3069
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3070
{
3071
    BlockDriver *drv = bs->drv;
3072
    int ret;
3073
    if (!drv)
3074
        return -ENOMEDIUM;
3075
    if (!drv->bdrv_truncate)
3076
        return -ENOTSUP;
3077
    if (bs->read_only)
3078
        return -EACCES;
3079
    if (bdrv_in_use(bs))
3080
        return -EBUSY;
3081
    ret = drv->bdrv_truncate(bs, offset);
3082
    if (ret == 0) {
3083
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3084
        bdrv_dev_resize_cb(bs);
3085
    }
3086
    return ret;
3087
}
3088

    
3089
/**
3090
 * Length of a allocated file in bytes. Sparse files are counted by actual
3091
 * allocated space. Return < 0 if error or unknown.
3092
 */
3093
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3094
{
3095
    BlockDriver *drv = bs->drv;
3096
    if (!drv) {
3097
        return -ENOMEDIUM;
3098
    }
3099
    if (drv->bdrv_get_allocated_file_size) {
3100
        return drv->bdrv_get_allocated_file_size(bs);
3101
    }
3102
    if (bs->file) {
3103
        return bdrv_get_allocated_file_size(bs->file);
3104
    }
3105
    return -ENOTSUP;
3106
}
3107

    
3108
/**
3109
 * Length of a file in bytes. Return < 0 if error or unknown.
3110
 */
3111
int64_t bdrv_getlength(BlockDriverState *bs)
3112
{
3113
    BlockDriver *drv = bs->drv;
3114
    if (!drv)
3115
        return -ENOMEDIUM;
3116

    
3117
    if (drv->has_variable_length) {
3118
        int ret = refresh_total_sectors(bs, bs->total_sectors);
3119
        if (ret < 0) {
3120
            return ret;
3121
        }
3122
    }
3123
    return bs->total_sectors * BDRV_SECTOR_SIZE;
3124
}
3125

    
3126
/* return 0 as number of sectors if no device present or error */
3127
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3128
{
3129
    int64_t length;
3130
    length = bdrv_getlength(bs);
3131
    if (length < 0)
3132
        length = 0;
3133
    else
3134
        length = length >> BDRV_SECTOR_BITS;
3135
    *nb_sectors_ptr = length;
3136
}
3137

    
3138
/* Record the error policies to apply to failed reads and failed writes. */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
3144

    
3145
/* Return the configured error policy for reads (is_read) or for writes. */
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
3149

    
3150
/* Map the configured read/write error policy and the error code to the
 * action to take.  The ENOSPC policy stops only when @error equals ENOSPC
 * and reports otherwise.  An unknown policy value aborts. */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read,
                                       int error)
{
    BlockdevOnError policy;

    if (is_read) {
        policy = bs->on_read_error;
    } else {
        policy = bs->on_write_error;
    }

    switch (policy) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        if (error == ENOSPC) {
            return BDRV_ACTION_STOP;
        }
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
3167

    
3168
/* This is done by device models because, while the block layer knows
3169
 * about the error, it does not know whether an operation comes from
3170
 * the device or the block layer (from a job, for example).
3171
 */
3172
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3173
                       bool is_read, int error)
3174
{
3175
    assert(error >= 0);
3176
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3177
    if (action == BDRV_ACTION_STOP) {
3178
        vm_stop(RUN_STATE_IO_ERROR);
3179
        bdrv_iostatus_set_err(bs, error);
3180
    }
3181
}
3182

    
3183
/* Return the read_only flag of @bs. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
3187

    
3188
/* Return the sg flag of @bs. */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
3192

    
3193
/* Return the current enable_write_cache setting of @bs. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
3197

    
3198
/* Set the write-cache-enable state of @bs and mirror it into the
 * BDRV_O_CACHE_WB open flag so a reopen() will preserve wce. */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    if (!wce) {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
        return;
    }
    bs->open_flags |= BDRV_O_CACHE_WB;
}
3209

    
3210
/* Return 1 if the backing image of @bs is encrypted; otherwise return the
 * encrypted flag of @bs itself. */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
3216

    
3217
/* Return non-zero if either the backing image or @bs itself is encrypted
 * and has no valid key yet. */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted && !backing->valid_key) {
        return 1;
    }
    if (bs->encrypted && !bs->valid_key) {
        return 1;
    }
    return 0;
}
3225

    
3226
/* Apply @key to @bs.  An encrypted backing image gets the key first; a
 * failure there is returned, and if @bs itself is not encrypted the call
 * ends with 0.  Returns -EINVAL when @bs is not encrypted and -ENOMEDIUM
 * when no driver (or no bdrv_set_key callback) is available.  Otherwise the
 * driver's result is returned and valid_key is updated accordingly; the
 * media-change callback fires the first time the key becomes valid. */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    BlockDriverState *backing = bs->backing_hd;
    int ret;

    if (backing && backing->encrypted) {
        ret = bdrv_set_key(backing, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
3251

    
3252
/* Return the format name of the attached driver, or NULL without one. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }
    return bs->drv->format_name;
}
3256

    
3257
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3258
                         void *opaque)
3259
{
3260
    BlockDriver *drv;
3261

    
3262
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
3263
        it(opaque, drv->format_name);
3264
    }
3265
}
3266

    
3267
/* This function is to find block backend bs */
3268
BlockDriverState *bdrv_find(const char *name)
3269
{
3270
    BlockDriverState *bs;
3271

    
3272
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3273
        if (!strcmp(name, bs->device_name)) {
3274
            return bs;
3275
        }
3276
    }
3277
    return NULL;
3278
}
3279

    
3280
/* This function is to find a node in the bs graph */
3281
BlockDriverState *bdrv_find_node(const char *node_name)
3282
{
3283
    BlockDriverState *bs;
3284

    
3285
    assert(node_name);
3286

    
3287
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3288
        if (!strcmp(node_name, bs->node_name)) {
3289
            return bs;
3290
        }
3291
    }
3292
    return NULL;
3293
}
3294

    
3295
/* Put this QMP function here so it can access the static graph_bdrv_states. */
3296
BlockDeviceInfoList *bdrv_named_nodes_list(void)
3297
{
3298
    BlockDeviceInfoList *list, *entry;
3299
    BlockDriverState *bs;
3300

    
3301
    list = NULL;
3302
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3303
        entry = g_malloc0(sizeof(*entry));
3304
        entry->value = bdrv_block_device_info(bs);
3305
        entry->next = list;
3306
        list = entry;
3307
    }
3308

    
3309
    return list;
3310
}
3311

    
3312
BlockDriverState *bdrv_lookup_bs(const char *device,
3313
                                 const char *node_name,
3314
                                 Error **errp)
3315
{
3316
    BlockDriverState *bs = NULL;
3317

    
3318
    if ((!device && !node_name) || (device && node_name)) {
3319
        error_setg(errp, "Use either device or node-name but not both");
3320
        return NULL;
3321
    }
3322

    
3323
    if (device) {
3324
        bs = bdrv_find(device);
3325

    
3326
        if (!bs) {
3327
            error_set(errp, QERR_DEVICE_NOT_FOUND, device);
3328
            return NULL;
3329
        }
3330

    
3331
        return bs;
3332
    }
3333

    
3334
    bs = bdrv_find_node(node_name);
3335

    
3336
    if (!bs) {
3337
        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
3338
        return NULL;
3339
    }
3340

    
3341
    return bs;
3342
}
3343

    
3344
/* Iterate over bdrv_states: NULL yields the first entry, any other @bs
 * yields its successor in the device list. */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (bs) {
        return QTAILQ_NEXT(bs, device_list);
    }
    return QTAILQ_FIRST(&bdrv_states);
}
3351

    
3352
/* Call @it with @opaque for every entry of bdrv_states. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, device_list) {
        it(opaque, cur);
    }
}
3360

    
3361
/* Return the device name stored in @bs. */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
3365

    
3366
/* Return the open flags stored in @bs. */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
3370

    
3371
int bdrv_flush_all(void)
3372
{
3373
    BlockDriverState *bs;
3374
    int result = 0;
3375

    
3376
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3377
        int ret = bdrv_flush(bs);
3378
        if (ret < 0 && !result) {
3379
            result = ret;
3380
        }
3381
    }
3382

    
3383
    return result;
3384
}
3385

    
3386
/* Unconditionally returns 1; @bs is not examined. */
int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    (void)bs;
    return 1;
}
3390

    
3391
/* Ask whether a freshly created image reads as zeroes.  Requires an
 * attached driver.  Returns 0 for images with a backing file and for
 * drivers without a bdrv_has_zero_init callback; otherwise returns the
 * driver's answer. */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    assert(drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }

    if (!drv->bdrv_has_zero_init) {
        /* safe default */
        return 0;
    }

    return drv->bdrv_has_zero_init(bs);
}
3407

    
3408
/* Return the unallocated_blocks_are_zero field from bdrv_get_info().
 * Returns false for images with a backing file or when bdrv_get_info()
 * does not return 0. */
bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo info;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &info) != 0) {
        return false;
    }

    return info.unallocated_blocks_are_zero;
}
3422

    
3423
/* Return the can_write_zeroes_with_unmap field from bdrv_get_info().
 * Returns false for images with a backing file, for images opened without
 * BDRV_O_UNMAP, or when bdrv_get_info() does not return 0. */
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo info;

    if (bs->backing_hd) {
        return false;
    }
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &info) != 0) {
        return false;
    }

    return info.can_write_zeroes_with_unmap;
}
3437

    
3438
typedef struct BdrvCoGetBlockStatusData {
3439
    BlockDriverState *bs;
3440
    BlockDriverState *base;
3441
    int64_t sector_num;
3442
    int nb_sectors;
3443
    int *pnum;
3444
    int64_t ret;
3445
    bool done;
3446
} BdrvCoGetBlockStatusData;
3447

    
3448
/*
3449
 * Returns true iff the specified sector is present in the disk image. Drivers
3450
 * not implementing the functionality are assumed to not support backing files,
3451
 * hence all their sectors are reported as allocated.
3452
 *
3453
 * If 'sector_num' is beyond the end of the disk image the return value is 0
3454
 * and 'pnum' is set to 0.
3455
 *
3456
 * 'pnum' is set to the number of sectors (including and immediately following
3457
 * the specified sector) that are known to be in the same
3458
 * allocated/unallocated state.
3459
 *
3460
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3461
 * beyond the end of the disk image it will be clamped.
3462
 */
3463
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3464
                                                     int64_t sector_num,
3465
                                                     int nb_sectors, int *pnum)
3466
{
3467
    int64_t length;
3468
    int64_t n;
3469
    int64_t ret, ret2;
3470

    
3471
    length = bdrv_getlength(bs);
3472
    if (length < 0) {
3473
        return length;
3474
    }
3475

    
3476
    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3477
        *pnum = 0;
3478
        return 0;
3479
    }
3480

    
3481
    n = bs->total_sectors - sector_num;
3482
    if (n < nb_sectors) {
3483
        nb_sectors = n;
3484
    }
3485

    
3486
    if (!bs->drv->bdrv_co_get_block_status) {
3487
        *pnum = nb_sectors;
3488
        ret = BDRV_BLOCK_DATA;
3489
        if (bs->drv->protocol_name) {
3490
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3491
        }
3492
        return ret;
3493
    }
3494

    
3495
    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3496
    if (ret < 0) {
3497
        *pnum = 0;
3498
        return ret;
3499
    }
3500

    
3501
    if (ret & BDRV_BLOCK_RAW) {
3502
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
3503
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3504
                                     *pnum, pnum);
3505
    }
3506

    
3507
    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3508
        if (bdrv_unallocated_blocks_are_zero(bs)) {
3509
            ret |= BDRV_BLOCK_ZERO;
3510
        } else if (bs->backing_hd) {
3511
            BlockDriverState *bs2 = bs->backing_hd;
3512
            int64_t length2 = bdrv_getlength(bs2);
3513
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3514
                ret |= BDRV_BLOCK_ZERO;
3515
            }
3516
        }
3517
    }
3518

    
3519
    if (bs->file &&
3520
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3521
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
3522
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3523
                                        *pnum, pnum);
3524
        if (ret2 >= 0) {
3525
            /* Ignore errors.  This is just providing extra information, it
3526
             * is useful but not necessary.
3527
             */
3528
            ret |= (ret2 & BDRV_BLOCK_ZERO);
3529
        }
3530
    }
3531

    
3532
    return ret;
3533
}
3534

    
3535
/* Coroutine wrapper for bdrv_get_block_status() */
3536
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3537
{
3538
    BdrvCoGetBlockStatusData *data = opaque;
3539
    BlockDriverState *bs = data->bs;
3540

    
3541
    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3542
                                         data->pnum);
3543
    data->done = true;
3544
}
3545

    
3546
/*
3547
 * Synchronous wrapper around bdrv_co_get_block_status().
3548
 *
3549
 * See bdrv_co_get_block_status() for details.
3550
 */
3551
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3552
                              int nb_sectors, int *pnum)
3553
{
3554
    Coroutine *co;
3555
    BdrvCoGetBlockStatusData data = {
3556
        .bs = bs,
3557
        .sector_num = sector_num,
3558
        .nb_sectors = nb_sectors,
3559
        .pnum = pnum,
3560
        .done = false,
3561
    };
3562

    
3563
    if (qemu_in_coroutine()) {
3564
        /* Fast-path if already in coroutine context */
3565
        bdrv_get_block_status_co_entry(&data);
3566
    } else {
3567
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3568
        qemu_coroutine_enter(co, &data);
3569
        while (!data.done) {
3570
            qemu_aio_wait();
3571
        }
3572
    }
3573
    return data.ret;
3574
}
3575

    
3576
/* Reduce bdrv_get_block_status() to a boolean.  Returns a negative errno on
 * failure, 1 if the status has BDRV_BLOCK_DATA set, or has BDRV_BLOCK_ZERO
 * set while bdrv_has_zero_init() is false, and 0 otherwise. */
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t status = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);

    if (status < 0) {
        return status;
    }
    if (status & BDRV_BLOCK_DATA) {
        return 1;
    }
    if ((status & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs)) {
        return 1;
    }
    return 0;
}
3587

    
3588
/*
3589
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3590
 *
3591
 * Return true if the given sector is allocated in any image between
3592
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3593
 * sector is allocated in any image of the chain.  Return false otherwise.
3594
 *
3595
 * 'pnum' is set to the number of sectors (including and immediately following
3596
 *  the specified sector) that are known to be in the same
3597
 *  allocated/unallocated state.
3598
 *
3599
 */
3600
int bdrv_is_allocated_above(BlockDriverState *top,
3601
                            BlockDriverState *base,
3602
                            int64_t sector_num,
3603
                            int nb_sectors, int *pnum)
3604
{
3605
    BlockDriverState *intermediate;
3606
    int ret, n = nb_sectors;
3607

    
3608
    intermediate = top;
3609
    while (intermediate && intermediate != base) {
3610
        int pnum_inter;
3611
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3612
                                &pnum_inter);
3613
        if (ret < 0) {
3614
            return ret;
3615
        } else if (ret) {
3616
            *pnum = pnum_inter;
3617
            return 1;
3618
        }
3619

    
3620
        /*
3621
         * [sector_num, nb_sectors] is unallocated on top but intermediate
3622
         * might have
3623
         *
3624
         * [sector_num+x, nr_sectors] allocated.
3625
         */
3626
        if (n > pnum_inter &&
3627
            (intermediate == top ||
3628
             sector_num + pnum_inter < intermediate->total_sectors)) {
3629
            n = pnum_inter;
3630
        }
3631

    
3632
        intermediate = intermediate->backing_hd;
3633
    }
3634

    
3635
    *pnum = n;
3636
    return 0;
3637
}
3638

    
3639
/* Return the name of the encrypted image: the backing file name if the
 * backing image is encrypted, else the own filename if @bs is encrypted,
 * else NULL. */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }
    if (bs->encrypted) {
        return bs->filename;
    }
    return NULL;
}
3648

    
3649
/* Copy the backing file name of @bs into @filename using pstrcpy() with
 * @filename_size as the buffer size. */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing = bs->backing_file;

    pstrcpy(filename, filename_size, backing);
}
3654

    
3655
/* Write @nb_sectors sectors from @buf through the driver's
 * bdrv_write_compressed callback.  Returns -ENOMEDIUM without a driver,
 * -ENOTSUP without the callback, -EIO if bdrv_check_request() rejects the
 * range.  The BDS must have no dirty bitmaps attached. */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
3670

    
3671
/* Fill *bdi via the driver's bdrv_get_info callback after zeroing it.
 * Returns -ENOMEDIUM without a driver and -ENOTSUP without the callback;
 * in both cases *bdi is left untouched. */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
3681

    
3682
/* Return the driver's bdrv_get_specific_info() result, or NULL when there
 * is no driver or the driver lacks the callback. */
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_get_specific_info) {
        return NULL;
    }
    return drv->bdrv_get_specific_info(bs);
}
3690

    
3691
/* Flat-buffer convenience wrapper: wrap @buf/@size in a single-element
 * I/O vector and forward it to bdrv_writev_vmstate(). */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    struct iovec single;
    QEMUIOVector vec;

    single.iov_base = (void *) buf;
    single.iov_len = size;

    qemu_iovec_init_external(&vec, &single, 1);
    return bdrv_writev_vmstate(bs, &vec, pos);
}
3703

    
3704
/* Save VM state at @pos through the first driver along the bs->file chain
 * that provides bdrv_save_vmstate.  Returns -ENOMEDIUM for a node without a
 * driver, -ENOTSUP when the chain ends without an implementation. */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    for (;;) {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_save_vmstate) {
            return drv->bdrv_save_vmstate(bs, qiov, pos);
        }
        if (!bs->file) {
            return -ENOTSUP;
        }
        bs = bs->file;
    }
}
3718

    
3719
/* Load VM state from @pos through the first driver along the bs->file
 * chain that provides bdrv_load_vmstate.  Returns -ENOMEDIUM for a node
 * without a driver, -ENOTSUP when the chain ends without an
 * implementation. */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    for (;;) {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_load_vmstate) {
            return drv->bdrv_load_vmstate(bs, buf, pos, size);
        }
        if (!bs->file) {
            return -ENOTSUP;
        }
        bs = bs->file;
    }
}
3731

    
3732
/* Forward @event to the driver's bdrv_debug_event callback, if @bs, its
 * driver and the callback all exist; otherwise do nothing. */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (bs && bs->drv && bs->drv->bdrv_debug_event) {
        bs->drv->bdrv_debug_event(bs, event);
    }
}
3740

    
3741
/* Invoke bdrv_debug_breakpoint on the first node along the bs->file chain
 * whose driver implements it.  Returns -ENOTSUP if the chain ends or a
 * node without a driver is reached first. */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_breakpoint) {
            return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
        }
    }

    return -ENOTSUP;
}
3754

    
3755
/* Invoke bdrv_debug_remove_breakpoint on the first node along the bs->file
 * chain whose driver implements it.  Returns -ENOTSUP if the chain ends or
 * a node without a driver is reached first. */
int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_remove_breakpoint) {
            return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
        }
    }

    return -ENOTSUP;
}
3767

    
3768
/* Invoke bdrv_debug_resume on the first node along the bs->file chain whose
 * driver implements it.  Returns -ENOTSUP if the chain ends or a node
 * without a driver is reached first. */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_resume) {
            return bs->drv->bdrv_debug_resume(bs, tag);
        }
    }

    return -ENOTSUP;
}
3780

    
3781
/* Invoke bdrv_debug_is_suspended on the first node along the bs->file chain
 * whose driver implements it.  Returns false if the chain ends or a node
 * without a driver is reached first. */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_is_suspended) {
            return bs->drv->bdrv_debug_is_suspended(bs, tag);
        }
    }

    return false;
}
3793

    
3794
/* Return 1 if BDRV_O_SNAPSHOT is set in the open flags of @bs, else 0. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) ? 1 : 0;
}
3798

    
3799
/* backing_file can either be relative, or absolute, or a protocol.  If it is
3800
 * relative, it must be relative to the chain.  So, passing in bs->filename
3801
 * from a BDS as backing_file should not be done, as that may be relative to
3802
 * the CWD rather than the chain. */
3803
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3804
        const char *backing_file)
3805
{
3806
    char *filename_full = NULL;
3807
    char *backing_file_full = NULL;
3808
    char *filename_tmp = NULL;
3809
    int is_protocol = 0;
3810
    BlockDriverState *curr_bs = NULL;
3811
    BlockDriverState *retval = NULL;
3812

    
3813
    if (!bs || !bs->drv || !backing_file) {
3814
        return NULL;
3815
    }
3816

    
3817
    filename_full     = g_malloc(PATH_MAX);
3818
    backing_file_full = g_malloc(PATH_MAX);
3819
    filename_tmp      = g_malloc(PATH_MAX);
3820

    
3821
    is_protocol = path_has_protocol(backing_file);
3822

    
3823
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3824

    
3825
        /* If either of the filename paths is actually a protocol, then
3826
         * compare unmodified paths; otherwise make paths relative */
3827
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3828
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3829
                retval = curr_bs->backing_hd;
3830
                break;
3831
            }
3832
        } else {
3833
            /* If not an absolute filename path, make it relative to the current
3834
             * image's filename path */
3835
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3836
                         backing_file);
3837

    
3838
            /* We are going to compare absolute pathnames */
3839
            if (!realpath(filename_tmp, filename_full)) {
3840
                continue;
3841
            }
3842

    
3843
            /* We need to make sure the backing filename we are comparing against
3844
             * is relative to the current image filename (or absolute) */
3845
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3846
                         curr_bs->backing_file);
3847

    
3848
            if (!realpath(filename_tmp, backing_file_full)) {
3849
                continue;
3850
            }
3851

    
3852
            if (strcmp(backing_file_full, filename_full) == 0) {
3853
                retval = curr_bs->backing_hd;
3854
                break;
3855
            }
3856
        }
3857
    }
3858

    
3859
    g_free(filename_full);
3860
    g_free(backing_file_full);
3861
    g_free(filename_tmp);
3862
    return retval;
3863
}
3864

    
3865
/*
 * Count the backing files chained below bs.
 *
 * The walk follows backing_hd links and stops at the first node that has
 * either no driver or no backing_hd; that node does not add to the count.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
3877

    
3878
/*
 * Return the last node of the backing chain that starts at bs, i.e. the
 * first node whose backing_hd is NULL. Returns NULL when bs is NULL.
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *base;

    if (bs == NULL) {
        return NULL;
    }

    for (base = bs; base->backing_hd != NULL; base = base->backing_hd) {
        /* keep descending until there is no further backing file */
    }

    return base;
}
3893

    
3894
/**************************************************************/
3895
/* async I/Os */
3896

    
3897
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3898
                                 QEMUIOVector *qiov, int nb_sectors,
3899
                                 BlockDriverCompletionFunc *cb, void *opaque)
3900
{
3901
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3902

    
3903
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3904
                                 cb, opaque, false);
3905
}
3906

    
3907
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3908
                                  QEMUIOVector *qiov, int nb_sectors,
3909
                                  BlockDriverCompletionFunc *cb, void *opaque)
3910
{
3911
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3912

    
3913
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3914
                                 cb, opaque, true);
3915
}
3916

    
3917
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
3918
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
3919
        BlockDriverCompletionFunc *cb, void *opaque)
3920
{
3921
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
3922

    
3923
    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
3924
                                 BDRV_REQ_ZERO_WRITE | flags,
3925
                                 cb, opaque, true);
3926
}
3927

    
3928

    
3929
typedef struct MultiwriteCB {
3930
    int error;
3931
    int num_requests;
3932
    int num_callbacks;
3933
    struct {
3934
        BlockDriverCompletionFunc *cb;
3935
        void *opaque;
3936
        QEMUIOVector *free_qiov;
3937
    } callbacks[];
3938
} MultiwriteCB;
3939

    
3940
/*
 * Invoke every recorded user callback with the overall error status and
 * release the merged qiov (if any) attached to each callback slot.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx;

    for (idx = 0; idx < mcb->num_callbacks; idx++) {
        QEMUIOVector *merged;

        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        /* Read the slot only after the callback ran, as the original did */
        merged = mcb->callbacks[idx].free_qiov;
        if (merged != NULL) {
            qemu_iovec_destroy(merged);
        }
        g_free(merged);
    }
}
3952

    
3953
static void multiwrite_cb(void *opaque, int ret)
3954
{
3955
    MultiwriteCB *mcb = opaque;
3956

    
3957
    trace_multiwrite_cb(mcb, ret);
3958

    
3959
    if (ret < 0 && !mcb->error) {
3960
        mcb->error = ret;
3961
    }
3962

    
3963
    mcb->num_requests--;
3964
    if (mcb->num_requests == 0) {
3965
        multiwrite_user_cb(mcb);
3966
        g_free(mcb);
3967
    }
3968
}
3969

    
3970
static int multiwrite_req_compare(const void *a, const void *b)
3971
{
3972
    const BlockRequest *req1 = a, *req2 = b;
3973

    
3974
    /*
3975
     * Note that we can't simply subtract req2->sector from req1->sector
3976
     * here as that could overflow the return value.
3977
     */
3978
    if (req1->sector > req2->sector) {
3979
        return 1;
3980
    } else if (req1->sector < req2->sector) {
3981
        return -1;
3982
    } else {
3983
        return 0;
3984
    }
3985
}
3986

    
3987
/*
3988
 * Takes a bunch of requests and tries to merge them. Returns the number of
3989
 * requests that remain after merging.
3990
 */
3991
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3992
    int num_reqs, MultiwriteCB *mcb)
3993
{
3994
    int i, outidx;
3995

    
3996
    // Sort requests by start sector
3997
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3998

    
3999
    // Check if adjacent requests touch the same clusters. If so, combine them,
4000
    // filling up gaps with zero sectors.
4001
    outidx = 0;
4002
    for (i = 1; i < num_reqs; i++) {
4003
        int merge = 0;
4004
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4005

    
4006
        // Handle exactly sequential writes and overlapping writes.
4007
        if (reqs[i].sector <= oldreq_last) {
4008
            merge = 1;
4009
        }
4010

    
4011
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4012
            merge = 0;
4013
        }
4014

    
4015
        if (merge) {
4016
            size_t size;
4017
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4018
            qemu_iovec_init(qiov,
4019
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4020

    
4021
            // Add the first request to the merged one. If the requests are
4022
            // overlapping, drop the last sectors of the first request.
4023
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
4024
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4025

    
4026
            // We should need to add any zeros between the two requests
4027
            assert (reqs[i].sector <= oldreq_last);
4028

    
4029
            // Add the second request
4030
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4031

    
4032
            reqs[outidx].nb_sectors = qiov->size >> 9;
4033
            reqs[outidx].qiov = qiov;
4034

    
4035
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4036
        } else {
4037
            outidx++;
4038
            reqs[outidx].sector     = reqs[i].sector;
4039
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4040
            reqs[outidx].qiov       = reqs[i].qiov;
4041
        }
4042
    }
4043

    
4044
    return outidx + 1;
4045
}
4046

    
4047
/*
4048
 * Submit multiple AIO write requests at once.
4049
 *
4050
 * On success, the function returns 0 and all requests in the reqs array have
4051
 * been submitted. In error case this function returns -1, and any of the
4052
 * requests may or may not be submitted yet. In particular, this means that the
4053
 * callback will be called for some of the requests, for others it won't. The
4054
 * caller must check the error field of the BlockRequest to wait for the right
4055
 * callbacks (if error != 0, no callback will be called).
4056
 *
4057
 * The implementation may modify the contents of the reqs array, e.g. to merge
4058
 * requests. However, the fields opaque and error are left unmodified as they
4059
 * are used to signal failure for a single request to the caller.
4060
 */
4061
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4062
{
4063
    MultiwriteCB *mcb;
4064
    int i;
4065

    
4066
    /* don't submit writes if we don't have a medium */
4067
    if (bs->drv == NULL) {
4068
        for (i = 0; i < num_reqs; i++) {
4069
            reqs[i].error = -ENOMEDIUM;
4070
        }
4071
        return -1;
4072
    }
4073

    
4074
    if (num_reqs == 0) {
4075
        return 0;
4076
    }
4077

    
4078
    // Create MultiwriteCB structure
4079
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4080
    mcb->num_requests = 0;
4081
    mcb->num_callbacks = num_reqs;
4082

    
4083
    for (i = 0; i < num_reqs; i++) {
4084
        mcb->callbacks[i].cb = reqs[i].cb;
4085
        mcb->callbacks[i].opaque = reqs[i].opaque;
4086
    }
4087

    
4088
    // Check for mergable requests
4089
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4090

    
4091
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4092

    
4093
    /* Run the aio requests. */
4094
    mcb->num_requests = num_reqs;
4095
    for (i = 0; i < num_reqs; i++) {
4096
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4097
                              reqs[i].nb_sectors, reqs[i].flags,
4098
                              multiwrite_cb, mcb,
4099
                              true);
4100
    }
4101

    
4102
    return 0;
4103
}
4104

    
4105
/*
 * Cancel an in-flight AIO request by dispatching to the cancel handler of
 * the AIOCBInfo the request was allocated with.
 */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    const AIOCBInfo *info = acb->aiocb_info;

    info->cancel(acb);
}
4109

    
4110
/**************************************************************/
4111
/* async block device emulation */
4112

    
4113
typedef struct BlockDriverAIOCBSync {
4114
    BlockDriverAIOCB common;
4115
    QEMUBH *bh;
4116
    int ret;
4117
    /* vector translation state */
4118
    QEMUIOVector *qiov;
4119
    uint8_t *bounce;
4120
    int is_write;
4121
} BlockDriverAIOCBSync;
4122

    
4123
/*
 * Cancel handler for BlockDriverAIOCBSync requests.
 *
 * The synchronous driver call has already run by the time the request can be
 * cancelled, so only the pending completion bottom half is deleted; the
 * completion callback is not invoked. The bounce buffer is normally freed by
 * bdrv_aio_bh_cb(), which will no longer run once the bottom half is deleted,
 * so it has to be released here to avoid leaking it.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;

    /* Allocated with qemu_blockalign() in bdrv_aio_rw_vector() */
    qemu_vfree(acb->bounce);
    acb->bounce = NULL;

    qemu_aio_release(acb);
}
4131

    
4132
static const AIOCBInfo bdrv_em_aiocb_info = {
4133
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4134
    .cancel             = bdrv_aio_cancel_em,
4135
};
4136

    
4137
static void bdrv_aio_bh_cb(void *opaque)
4138
{
4139
    BlockDriverAIOCBSync *acb = opaque;
4140

    
4141
    if (!acb->is_write)
4142
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4143
    qemu_vfree(acb->bounce);
4144
    acb->common.cb(acb->common.opaque, acb->ret);
4145
    qemu_bh_delete(acb->bh);
4146
    acb->bh = NULL;
4147
    qemu_aio_release(acb);
4148
}
4149

    
4150
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4151
                                            int64_t sector_num,
4152
                                            QEMUIOVector *qiov,
4153
                                            int nb_sectors,
4154
                                            BlockDriverCompletionFunc *cb,
4155
                                            void *opaque,
4156
                                            int is_write)
4157

    
4158
{
4159
    BlockDriverAIOCBSync *acb;
4160

    
4161
    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4162
    acb->is_write = is_write;
4163
    acb->qiov = qiov;
4164
    acb->bounce = qemu_blockalign(bs, qiov->size);
4165
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4166

    
4167
    if (is_write) {
4168
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4169
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4170
    } else {
4171
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4172
    }
4173

    
4174
    qemu_bh_schedule(acb->bh);
4175

    
4176
    return &acb->common;
4177
}
4178

    
4179
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4180
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4181
        BlockDriverCompletionFunc *cb, void *opaque)
4182
{
4183
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4184
}
4185

    
4186
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4187
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4188
        BlockDriverCompletionFunc *cb, void *opaque)
4189
{
4190
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4191
}
4192

    
4193

    
4194
typedef struct BlockDriverAIOCBCoroutine {
4195
    BlockDriverAIOCB common;
4196
    BlockRequest req;
4197
    bool is_write;
4198
    bool *done;
4199
    QEMUBH* bh;
4200
} BlockDriverAIOCBCoroutine;
4201

    
4202
/*
 * Cancel handler for coroutine-based AIOCBs.
 *
 * The request is not aborted; instead this waits, by running
 * qemu_aio_wait(), until bdrv_co_em_bh() reports completion through the
 * done flag installed here.
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool finished = false;

    acb->done = &finished;
    do {
        qemu_aio_wait();
    } while (!finished);
}
4213

    
4214
static const AIOCBInfo bdrv_em_co_aiocb_info = {
4215
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4216
    .cancel             = bdrv_aio_co_cancel_em,
4217
};
4218

    
4219
static void bdrv_co_em_bh(void *opaque)
4220
{
4221
    BlockDriverAIOCBCoroutine *acb = opaque;
4222

    
4223
    acb->common.cb(acb->common.opaque, acb->req.error);
4224

    
4225
    if (acb->done) {
4226
        *acb->done = true;
4227
    }
4228

    
4229
    qemu_bh_delete(acb->bh);
4230
    qemu_aio_release(acb);
4231
}
4232

    
4233
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4234
static void coroutine_fn bdrv_co_do_rw(void *opaque)
4235
{
4236
    BlockDriverAIOCBCoroutine *acb = opaque;
4237
    BlockDriverState *bs = acb->common.bs;
4238

    
4239
    if (!acb->is_write) {
4240
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4241
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4242
    } else {
4243
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4244
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4245
    }
4246

    
4247
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4248
    qemu_bh_schedule(acb->bh);
4249
}
4250

    
4251
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4252
                                               int64_t sector_num,
4253
                                               QEMUIOVector *qiov,
4254
                                               int nb_sectors,
4255
                                               BdrvRequestFlags flags,
4256
                                               BlockDriverCompletionFunc *cb,
4257
                                               void *opaque,
4258
                                               bool is_write)
4259
{
4260
    Coroutine *co;
4261
    BlockDriverAIOCBCoroutine *acb;
4262

    
4263
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4264
    acb->req.sector = sector_num;
4265
    acb->req.nb_sectors = nb_sectors;
4266
    acb->req.qiov = qiov;
4267
    acb->req.flags = flags;
4268
    acb->is_write = is_write;
4269
    acb->done = NULL;
4270

    
4271
    co = qemu_coroutine_create(bdrv_co_do_rw);
4272
    qemu_coroutine_enter(co, acb);
4273

    
4274
    return &acb->common;
4275
}
4276

    
4277
/*
 * Coroutine entry point for bdrv_aio_flush(): run bdrv_co_flush() on the
 * AIOCB's device, store the result in req.error and schedule the completion
 * bottom half.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->req.error = bdrv_co_flush(acb->common.bs);

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4286

    
4287
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4288
        BlockDriverCompletionFunc *cb, void *opaque)
4289
{
4290
    trace_bdrv_aio_flush(bs, opaque);
4291

    
4292
    Coroutine *co;
4293
    BlockDriverAIOCBCoroutine *acb;
4294

    
4295
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4296
    acb->done = NULL;
4297

    
4298
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4299
    qemu_coroutine_enter(co, acb);
4300

    
4301
    return &acb->common;
4302
}
4303

    
4304
/*
 * Coroutine entry point for bdrv_aio_discard(): run bdrv_co_discard() for
 * the sector range stored in the AIOCB, store the result in req.error and
 * schedule the completion bottom half.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockRequest *req = &acb->req;

    req->error = bdrv_co_discard(acb->common.bs, req->sector, req->nb_sectors);

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4313

    
4314
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4315
        int64_t sector_num, int nb_sectors,
4316
        BlockDriverCompletionFunc *cb, void *opaque)
4317
{
4318
    Coroutine *co;
4319
    BlockDriverAIOCBCoroutine *acb;
4320

    
4321
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4322

    
4323
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4324
    acb->req.sector = sector_num;
4325
    acb->req.nb_sectors = nb_sectors;
4326
    acb->done = NULL;
4327
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4328
    qemu_coroutine_enter(co, acb);
4329

    
4330
    return &acb->common;
4331
}
4332

    
4333
void bdrv_init(void)
4334
{
4335
    module_call_init(MODULE_INIT_BLOCK);
4336
}
4337

    
4338
void bdrv_init_with_whitelist(void)
4339
{
4340
    use_bdrv_whitelist = 1;
4341
    bdrv_init();
4342
}
4343

    
4344
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4345
                   BlockDriverCompletionFunc *cb, void *opaque)
4346
{
4347
    BlockDriverAIOCB *acb;
4348

    
4349
    acb = g_slice_alloc(aiocb_info->aiocb_size);
4350
    acb->aiocb_info = aiocb_info;
4351
    acb->bs = bs;
4352
    acb->cb = cb;
4353
    acb->opaque = opaque;
4354
    return acb;
4355
}
4356

    
4357
void qemu_aio_release(void *p)
4358
{
4359
    BlockDriverAIOCB *acb = p;
4360
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4361
}
4362

    
4363
/**************************************************************/
4364
/* Coroutine block device emulation */
4365

    
4366
typedef struct CoroutineIOCompletion {
4367
    Coroutine *coroutine;
4368
    int ret;
4369
} CoroutineIOCompletion;
4370

    
4371
static void bdrv_co_io_em_complete(void *opaque, int ret)
4372
{
4373
    CoroutineIOCompletion *co = opaque;
4374

    
4375
    co->ret = ret;
4376
    qemu_coroutine_enter(co->coroutine, NULL);
4377
}
4378

    
4379
/*
 * Emulate a coroutine read/write with the driver's bdrv_aio_readv /
 * bdrv_aio_writev callbacks.
 *
 * Submits the AIO request, yields until bdrv_co_io_em_complete() re-enters
 * this coroutine, and returns the completion status. Returns -EIO if the
 * driver did not return an AIOCB.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    BlockDriverAIOCB *acb;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    /* Resumed by bdrv_co_io_em_complete() once co.ret is valid */
    qemu_coroutine_yield();

    return co.ret;
}
4404

    
4405
/* Read flavour of bdrv_co_io_em() */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4411

    
4412
/* Write flavour of bdrv_co_io_em() */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4418

    
4419
/* Coroutine entry point for bdrv_flush(): store bdrv_co_flush()'s result */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;
    int result = bdrv_co_flush(rwco->bs);

    rwco->ret = result;
}
4425

    
4426
/*
 * Flush bs and then, recursively, bs->file.
 *
 * Returns 0 immediately if bs is NULL, has no medium inserted or is
 * read-only. Otherwise the driver's flush-to-OS callback is run first; the
 * flush-to-disk step is skipped when BDRV_O_NO_FLUSH is set. A negative
 * result from either step is returned without flushing bs->file.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);

        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
            if (acb != NULL) {
                /* Resumed by bdrv_co_io_em_complete() */
                qemu_coroutine_yield();
                ret = co.ret;
            } else {
                ret = -EIO;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and therefore don't support bdrv_flush. Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk. Returning an
             * error doesn't work because that would break guests even if the
             * server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }

        if (ret < 0) {
            return ret;
        }
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
4488

    
4489
/*
 * Call the driver's bdrv_invalidate_cache callback for bs, if bs has a
 * driver and the driver implements it; otherwise do nothing.
 */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_invalidate_cache) {
        return;
    }

    drv->bdrv_invalidate_cache(bs);
}
4495

    
4496
void bdrv_invalidate_cache_all(void)
4497
{
4498
    BlockDriverState *bs;
4499

    
4500
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4501
        bdrv_invalidate_cache(bs);
4502
    }
4503
}
4504

    
4505
void bdrv_clear_incoming_migration_all(void)
4506
{
4507
    BlockDriverState *bs;
4508

    
4509
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4510
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4511
    }
4512
}
4513

    
4514
/*
 * Synchronous flush of bs.
 *
 * Inside a coroutine the flush is run directly. Otherwise a coroutine is
 * created for it and qemu_aio_wait() is called until the result is no longer
 * NOT_DONE. Returns the result of bdrv_co_flush().
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };
    Coroutine *co;

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
        return rwco.ret;
    }

    co = qemu_coroutine_create(bdrv_flush_co_entry);
    qemu_coroutine_enter(co, &rwco);
    while (rwco.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return rwco.ret;
}
4535

    
4536
/* Coroutine entry point for bdrv_discard(): store bdrv_co_discard()'s result */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;
    int result = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);

    rwco->ret = result;
}
4542

    
4543
/* if no limit is specified in the BlockLimits use a default
4544
 * of 32768 512-byte sectors (16 MiB) per request.
4545
 */
4546
#define MAX_DISCARD_DEFAULT 32768
4547

    
4548
/*
 * Discard nb_sectors sectors starting at sector_num.
 *
 * Returns -ENOMEDIUM if bs has no driver, -EIO if bdrv_check_request()
 * rejects the range, and -EROFS if bs is read-only. The range is then
 * cleared in all dirty bitmaps. If BDRV_O_UNMAP is not set, or the driver
 * implements neither bdrv_co_discard nor bdrv_aio_discard, the function
 * returns 0 without issuing any request.
 *
 * Otherwise the range is split into chunks that respect the driver's
 * discard_alignment and max_discard limits (MAX_DISCARD_DEFAULT when no
 * limit is set). A driver result of -ENOTSUP is ignored; any other non-zero
 * result aborts the loop and is returned.
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request: a misaligned start only discards up to the next
         * alignment boundary so that following chunks start aligned */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /*
             * Submit only the current chunk (num), exactly like the
             * coroutine branch above. Passing the whole remaining length
             * would bypass the alignment and size limits and discard the
             * later ranges again on following iterations.
             */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                /* Resumed by bdrv_co_io_em_complete() */
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
4618

    
4619
/*
 * Synchronous discard of nb_sectors sectors starting at sector_num.
 *
 * Inside a coroutine the discard is run directly. Otherwise a coroutine is
 * created for it and qemu_aio_wait() is called until the result is no longer
 * NOT_DONE. Returns the result of bdrv_co_discard().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };
    Coroutine *co;

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
        return rwco.ret;
    }

    co = qemu_coroutine_create(bdrv_discard_co_entry);
    qemu_coroutine_enter(co, &rwco);
    while (rwco.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return rwco.ret;
}
4642

    
4643
/**************************************************************/
4644
/* removable device support */
4645

    
4646
/**
4647
 * Return TRUE if the media is present
4648
 */
4649
int bdrv_is_inserted(BlockDriverState *bs)
4650
{
4651
    BlockDriver *drv = bs->drv;
4652

    
4653
    if (!drv)
4654
        return 0;
4655
    if (!drv->bdrv_is_inserted)
4656
        return 1;
4657
    return drv->bdrv_is_inserted(bs);
4658
}
4659

    
4660
/**
4661
 * Return whether the media changed since the last call to this
4662
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4663
 */
4664
int bdrv_media_changed(BlockDriverState *bs)
4665
{
4666
    BlockDriver *drv = bs->drv;
4667

    
4668
    if (drv && drv->bdrv_media_changed) {
4669
        return drv->bdrv_media_changed(bs);
4670
    }
4671
    return -ENOTSUP;
4672
}
4673

    
4674
/**
4675
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4676
 */
4677
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4678
{
4679
    BlockDriver *drv = bs->drv;
4680

    
4681
    if (drv && drv->bdrv_eject) {
4682
        drv->bdrv_eject(bs, eject_flag);
4683
    }
4684

    
4685
    if (bs->device_name[0] != '\0') {
4686
        bdrv_emit_qmp_eject_event(bs, eject_flag);
4687
    }
4688
}
4689

    
4690
/**
4691
 * Lock or unlock the media (if it is locked, the user won't be able
4692
 * to eject it manually).
4693
 */
4694
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4695
{
4696
    BlockDriver *drv = bs->drv;
4697

    
4698
    trace_bdrv_lock_medium(bs, locked);
4699

    
4700
    if (drv && drv->bdrv_lock_medium) {
4701
        drv->bdrv_lock_medium(bs, locked);
4702
    }
4703
}
4704

    
4705
/* needed for generic scsi interface */
4706

    
4707
/*
 * Forward an ioctl request to the driver's bdrv_ioctl callback. Returns
 * -ENOTSUP if bs has no driver or the driver does not implement it.
 */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }

    return drv->bdrv_ioctl(bs, req, buf);
}
4715

    
4716
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4717
        unsigned long int req, void *buf,
4718
        BlockDriverCompletionFunc *cb, void *opaque)
4719
{
4720
    BlockDriver *drv = bs->drv;
4721

    
4722
    if (drv && drv->bdrv_aio_ioctl)
4723
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4724
    return NULL;
4725
}
4726

    
4727
/*
 * Record align as the buffer alignment of bs; it is consulted by
 * qemu_blockalign() and bdrv_qiov_is_aligned().
 */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
4731

    
4732
/*
 * Allocate size bytes with qemu_memalign(), aligned to bs->buffer_alignment.
 * An alignment of 512 is used when bs is NULL or its buffer_alignment is 0.
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    size_t alignment = 512;

    if (bs && bs->buffer_alignment) {
        alignment = bs->buffer_alignment;
    }

    return qemu_memalign(alignment, size);
}
4736

    
4737
/*
4738
 * Check if all memory in this vector is sector aligned.
4739
 */
4740
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4741
{
4742
    int i;
4743

    
4744
    for (i = 0; i < qiov->niov; i++) {
4745
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4746
            return false;
4747
        }
4748
    }
4749

    
4750
    return true;
4751
}
4752

    
4753
/*
 * Create a dirty bitmap for bs and insert it at the head of
 * bs->dirty_bitmaps.
 *
 * granularity is given in bytes; it must be a power of two and at least one
 * sector (both are asserted). The bitmap covers the device length in
 * sectors as reported by bdrv_getlength().
 */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    BdrvDirtyBitmap *bitmap;
    int64_t nb_sectors;
    int sector_granularity;

    assert((granularity & (granularity - 1)) == 0);

    /* Convert from bytes to sectors; must not become zero */
    sector_granularity = granularity >> BDRV_SECTOR_BITS;
    assert(sector_granularity);

    nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    /* hbitmap_alloc() takes log2 of the granularity */
    bitmap->bitmap = hbitmap_alloc(nb_sectors, ffs(sector_granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);

    return bitmap;
}
4768

    
4769
/*
 * Remove bitmap from bs->dirty_bitmaps and free it together with its
 * underlying hbitmap. Nothing happens if bitmap is not on the list.
 */
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *cur, *tmp;

    QLIST_FOREACH_SAFE(cur, &bs->dirty_bitmaps, list, tmp) {
        if (cur != bitmap) {
            continue;
        }

        QLIST_REMOVE(bitmap, list);
        hbitmap_free(bitmap->bitmap);
        g_free(bitmap);
        return;
    }
}
4781

    
4782
/*
 * Build a newly allocated BlockDirtyInfoList with one entry per dirty bitmap
 * of bs, in list order. Each entry carries the bitmap's dirty count and its
 * granularity in bytes. Returns NULL if bs has no dirty bitmaps.
 */
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BlockDirtyInfoList *head = NULL;
    BlockDirtyInfoList **tail = &head;
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *node = g_malloc0(sizeof(BlockDirtyInfoList));

        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));

        /* Append at the tail so the result keeps the list order */
        node->value = info;
        *tail = node;
        tail = &node->next;
    }

    return head;
}
4801

    
4802
/*
 * Return the dirty state of sector in bitmap as reported by hbitmap_get(),
 * or 0 when bitmap is NULL. bs is not used.
 */
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (!bitmap) {
        return 0;
    }

    return hbitmap_get(bitmap->bitmap, sector);
}
4810

    
4811
void bdrv_dirty_iter_init(BlockDriverState *bs,
4812
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
4813
{
4814
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
4815
}
4816

    
4817
/*
 * Apply hbitmap_set() for the range (@cur_sector, @nr_sectors) to every
 * dirty bitmap attached to @bs.
 */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        hbitmap_set(bm->bitmap, cur_sector, nr_sectors);
    }
}
4825

    
4826
/*
 * Apply hbitmap_reset() for the range (@cur_sector, @nr_sectors) to every
 * dirty bitmap attached to @bs.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bm->bitmap, cur_sector, nr_sectors);
    }
}
4833

    
4834
/*
 * Return hbitmap_count() of the hbitmap backing @bitmap.
 *
 * @bitmap must not be NULL; @bs is not used.
 */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
4838

    
4839
/* Get a reference to bs */
4840
void bdrv_ref(BlockDriverState *bs)
4841
{
4842
    bs->refcnt++;
4843
}
4844

    
4845
/* Release a previously grabbed reference to bs.
4846
 * If after releasing, reference count is zero, the BlockDriverState is
4847
 * deleted. */
4848
void bdrv_unref(BlockDriverState *bs)
4849
{
4850
    assert(bs->refcnt > 0);
4851
    if (--bs->refcnt == 0) {
4852
        bdrv_delete(bs);
4853
    }
4854
}
4855

    
4856
/*
 * Update the in_use flag of @bs.
 *
 * The new value must differ from the current one (asserted), so callers
 * cannot set or clear the flag twice in a row.
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(in_use != bs->in_use);
    bs->in_use = in_use;
}
4861

    
4862
/* Return the current in_use flag of @bs. */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
4866

    
4867
/*
 * Turn on I/O status tracking for @bs and start from
 * BLOCK_DEVICE_IO_STATUS_OK.
 */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    bs->iostatus_enabled = true;
}
4872

    
4873
/* The I/O status is only enabled if the drive explicitly
4874
 * enables it _and_ the VM is configured to stop on errors */
4875
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4876
{
4877
    return (bs->iostatus_enabled &&
4878
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4879
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4880
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4881
}
4882

    
4883
/* Turn off I/O status tracking for @bs; the stored iostatus is untouched. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
4887

    
4888
/*
 * Reset the I/O status of @bs to BLOCK_DEVICE_IO_STATUS_OK.
 *
 * Does nothing unless bdrv_iostatus_is_enabled() holds.  If a block job is
 * attached to @bs, its I/O status is reset as well.
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;

    if (bs->job) {
        block_job_iostatus_reset(bs->job);
    }
}
4897

    
4898
/*
 * Record an I/O error on @bs.
 *
 * @error is compared against ENOSPC: that value maps to
 * BLOCK_DEVICE_IO_STATUS_NOSPACE, anything else to
 * BLOCK_DEVICE_IO_STATUS_FAILED.  Only the first error is kept: once the
 * status is no longer OK it is left alone until it is reset.
 *
 * I/O status tracking must be enabled for @bs (asserted).
 */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));

    if (bs->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
        return;
    }

    if (error == ENOSPC) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
    } else {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
4906

    
4907
void
4908
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4909
        enum BlockAcctType type)
4910
{
4911
    assert(type < BDRV_MAX_IOTYPE);
4912

    
4913
    cookie->bytes = bytes;
4914
    cookie->start_time_ns = get_clock();
4915
    cookie->type = type;
4916
}
4917

    
4918
void
4919
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4920
{
4921
    assert(cookie->type < BDRV_MAX_IOTYPE);
4922

    
4923
    bs->nr_bytes[cookie->type] += cookie->bytes;
4924
    bs->nr_ops[cookie->type]++;
4925
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4926
}
4927

    
4928
void bdrv_img_create(const char *filename, const char *fmt,
4929
                     const char *base_filename, const char *base_fmt,
4930
                     char *options, uint64_t img_size, int flags,
4931
                     Error **errp, bool quiet)
4932
{
4933
    QEMUOptionParameter *param = NULL, *create_options = NULL;
4934
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
4935
    BlockDriver *drv, *proto_drv;
4936
    BlockDriver *backing_drv = NULL;
4937
    Error *local_err = NULL;
4938
    int ret = 0;
4939

    
4940
    /* Find driver and parse its options */
4941
    drv = bdrv_find_format(fmt);
4942
    if (!drv) {
4943
        error_setg(errp, "Unknown file format '%s'", fmt);
4944
        return;
4945
    }
4946

    
4947
    proto_drv = bdrv_find_protocol(filename, true);
4948
    if (!proto_drv) {
4949
        error_setg(errp, "Unknown protocol '%s'", filename);
4950
        return;
4951
    }
4952

    
4953
    create_options = append_option_parameters(create_options,
4954
                                              drv->create_options);
4955
    create_options = append_option_parameters(create_options,
4956
                                              proto_drv->create_options);
4957

    
4958
    /* Create parameter list with default values */
4959
    param = parse_option_parameters("", create_options, param);
4960

    
4961
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4962

    
4963
    /* Parse -o options */
4964
    if (options) {
4965
        param = parse_option_parameters(options, create_options, param);
4966
        if (param == NULL) {
4967
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
4968
            goto out;
4969
        }
4970
    }
4971

    
4972
    if (base_filename) {
4973
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4974
                                 base_filename)) {
4975
            error_setg(errp, "Backing file not supported for file format '%s'",
4976
                       fmt);
4977
            goto out;
4978
        }
4979
    }
4980

    
4981
    if (base_fmt) {
4982
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4983
            error_setg(errp, "Backing file format not supported for file "
4984
                             "format '%s'", fmt);
4985
            goto out;
4986
        }
4987
    }
4988

    
4989
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4990
    if (backing_file && backing_file->value.s) {
4991
        if (!strcmp(filename, backing_file->value.s)) {
4992
            error_setg(errp, "Error: Trying to create an image with the "
4993
                             "same filename as the backing file");
4994
            goto out;
4995
        }
4996
    }
4997

    
4998
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4999
    if (backing_fmt && backing_fmt->value.s) {
5000
        backing_drv = bdrv_find_format(backing_fmt->value.s);
5001
        if (!backing_drv) {
5002
            error_setg(errp, "Unknown backing file format '%s'",
5003
                       backing_fmt->value.s);
5004
            goto out;
5005
        }
5006
    }
5007

    
5008
    // The size for the image must always be specified, with one exception:
5009
    // If we are using a backing file, we can obtain the size from there
5010
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
5011
    if (size && size->value.n == -1) {
5012
        if (backing_file && backing_file->value.s) {
5013
            BlockDriverState *bs;
5014
            uint64_t size;
5015
            char buf[32];
5016
            int back_flags;
5017

    
5018
            /* backing files always opened read-only */
5019
            back_flags =
5020
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5021

    
5022
            bs = bdrv_new("");
5023

    
5024
            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
5025
                            backing_drv, &local_err);
5026
            if (ret < 0) {
5027
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
5028
                                 backing_file->value.s,
5029
                                 error_get_pretty(local_err));
5030
                error_free(local_err);
5031
                local_err = NULL;
5032
                bdrv_unref(bs);
5033
                goto out;
5034
            }
5035
            bdrv_get_geometry(bs, &size);
5036
            size *= 512;
5037

    
5038
            snprintf(buf, sizeof(buf), "%" PRId64, size);
5039
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5040

    
5041
            bdrv_unref(bs);
5042
        } else {
5043
            error_setg(errp, "Image creation needs a size parameter");
5044
            goto out;
5045
        }
5046
    }
5047

    
5048
    if (!quiet) {
5049
        printf("Formatting '%s', fmt=%s ", filename, fmt);
5050
        print_option_parameters(param);
5051
        puts("");
5052
    }
5053
    ret = bdrv_create(drv, filename, param, &local_err);
5054
    if (ret == -EFBIG) {
5055
        /* This is generally a better message than whatever the driver would
5056
         * deliver (especially because of the cluster_size_hint), since that
5057
         * is most probably not much different from "image too large". */
5058
        const char *cluster_size_hint = "";
5059
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5060
            cluster_size_hint = " (try using a larger cluster size)";
5061
        }
5062
        error_setg(errp, "The image size is too large for file format '%s'"
5063
                   "%s", fmt, cluster_size_hint);
5064
        error_free(local_err);
5065
        local_err = NULL;
5066
    }
5067

    
5068
out:
5069
    free_option_parameters(create_options);
5070
    free_option_parameters(param);
5071

    
5072
    if (error_is_set(&local_err)) {
5073
        error_propagate(errp, local_err);
5074
    }
5075
}
5076

    
5077
/*
 * Return the AioContext used by @bs.
 *
 * For now every BlockDriverState runs in the main loop AioContext, so @bs
 * itself is not consulted.
 */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    AioContext *main_ctx = qemu_get_aio_context();

    return main_ctx;
}
5082

    
5083
void bdrv_add_before_write_notifier(BlockDriverState *bs,
5084
                                    NotifierWithReturn *notifier)
5085
{
5086
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5087
}
5088

    
5089
/*
 * Forward @options to the driver's bdrv_amend_options callback.
 *
 * Returns the callback's result, or -ENOTSUP if the driver of @bs does not
 * implement the callback.  bs->drv must not be NULL.
 */
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options != NULL) {
        return bs->drv->bdrv_amend_options(bs, options);
    }

    return -ENOTSUP;
}
5096

    
5097
/* Used to recurse on single child block filters.
5098
 * Single child block filter will store their child in bs->file.
5099
 */
5100
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5101
                                      BlockDriverState *candidate)
5102
{
5103
    if (!bs->drv) {
5104
        return false;
5105
    }
5106

    
5107
    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5108
        if (bs == candidate) {
5109
            return true;
5110
        } else {
5111
            return false;
5112
        }
5113
    }
5114

    
5115
    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5116
        return false;
5117
    }
5118

    
5119
    if (!bs->file) {
5120
        return false;
5121
    }
5122

    
5123
    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5124
}
5125

    
5126
/*
 * Check whether @candidate is the first non-filter node reachable from @bs.
 *
 * A driver-specific bdrv_recurse_is_first_non_filter callback takes
 * precedence when @bs has a driver that provides one; otherwise the generic
 * single-child implementation is used.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    bool has_driver_hook = bs->drv && bs->drv->bdrv_recurse_is_first_non_filter;

    if (!has_driver_hook) {
        return bdrv_generic_is_first_non_filter(bs, candidate);
    }

    return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
}
5135

    
5136
/* This function checks if the candidate is the first non filter bs down it's
5137
 * bs chain. Since we don't have pointers to parents it explore all bs chains
5138
 * from the top. Some filters can choose not to pass down the recursion.
5139
 */
5140
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5141
{
5142
    BlockDriverState *bs;
5143

    
5144
    /* walk down the bs forest recursively */
5145
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5146
        bool perm;
5147

    
5148
        if (!bs->file) {
5149
            continue;
5150
        }
5151

    
5152
        perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);
5153

    
5154
        /* candidate is the first non filter */
5155
        if (perm) {
5156
            return true;
5157
        }
5158
    }
5159

    
5160
    return false;
5161
}