Statistics
| Branch: | Revision:

root / block.c @ f67503e5

History | View | Annotate | Download (151.1 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "block/qapi.h"
36
#include "qmp-commands.h"
37
#include "qemu/timer.h"
38

    
39
#ifdef CONFIG_BSD
40
#include <sys/types.h>
41
#include <sys/stat.h>
42
#include <sys/ioctl.h>
43
#include <sys/queue.h>
44
#ifndef __DragonFly__
45
#include <sys/disk.h>
46
#endif
47
#endif
48

    
49
#ifdef _WIN32
50
#include <windows.h>
51
#endif
52

    
53
struct BdrvDirtyBitmap {
54
    HBitmap *bitmap;
55
    QLIST_ENTRY(BdrvDirtyBitmap) list;
56
};
57

    
58
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59

    
60
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63
        BlockDriverCompletionFunc *cb, void *opaque);
64
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66
        BlockDriverCompletionFunc *cb, void *opaque);
67
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68
                                         int64_t sector_num, int nb_sectors,
69
                                         QEMUIOVector *iov);
70
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71
                                         int64_t sector_num, int nb_sectors,
72
                                         QEMUIOVector *iov);
73
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75
    BdrvRequestFlags flags);
76
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78
    BdrvRequestFlags flags);
79
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80
                                               int64_t sector_num,
81
                                               QEMUIOVector *qiov,
82
                                               int nb_sectors,
83
                                               BdrvRequestFlags flags,
84
                                               BlockDriverCompletionFunc *cb,
85
                                               void *opaque,
86
                                               bool is_write);
87
static void coroutine_fn bdrv_co_do_rw(void *opaque);
88
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90

    
91
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
93

    
94
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96

    
97
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
99

    
100
/* If non-zero, use only whitelisted block drivers */
101
static int use_bdrv_whitelist;
102

    
103
#ifdef _WIN32
104
/*
 * Tell whether @filename begins with an ASCII letter immediately followed by
 * a colon (a drive prefix such as "c:").
 *
 * Returns 1 if it does, 0 otherwise.  The second character is only examined
 * when the first one is a letter.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char drive = filename[0];
    const int lower = (drive >= 'a' && drive <= 'z');
    const int upper = (drive >= 'A' && drive <= 'Z');

    if (!lower && !upper) {
        return 0;
    }
    return filename[1] == ':';
}
110

    
111
/*
 * Tell whether @filename names a whole drive or device rather than a file.
 *
 * Returns 1 when @filename is exactly a drive prefix ("X:") or starts with
 * "\\.\" or "//./", and 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* A bare drive prefix with nothing after the colon. */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    /* Device-style prefix, written with either kind of slash. */
    return (strstart(filename, "\\\\.\\", NULL) ||
            strstart(filename, "//./", NULL)) ? 1 : 0;
}
121
#endif
122

    
123
/* throttling disk I/O limits */
124
void bdrv_set_io_limits(BlockDriverState *bs,
125
                        ThrottleConfig *cfg)
126
{
127
    int i;
128

    
129
    throttle_config(&bs->throttle_state, cfg);
130

    
131
    for (i = 0; i < 2; i++) {
132
        qemu_co_enter_next(&bs->throttled_reqs[i]);
133
    }
134
}
135

    
136
/* this function drain all the throttled IOs */
137
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138
{
139
    bool drained = false;
140
    bool enabled = bs->io_limits_enabled;
141
    int i;
142

    
143
    bs->io_limits_enabled = false;
144

    
145
    for (i = 0; i < 2; i++) {
146
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147
            drained = true;
148
        }
149
    }
150

    
151
    bs->io_limits_enabled = enabled;
152

    
153
    return drained;
154
}
155

    
156
/*
 * Turn I/O throttling off for @bs: clear the enabled flag, restart every
 * request still waiting in the throttled queues, then destroy the throttle
 * state.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    /* The return value only reports whether anything was restarted. */
    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}
164

    
165
static void bdrv_throttle_read_timer_cb(void *opaque)
166
{
167
    BlockDriverState *bs = opaque;
168
    qemu_co_enter_next(&bs->throttled_reqs[0]);
169
}
170

    
171
static void bdrv_throttle_write_timer_cb(void *opaque)
172
{
173
    BlockDriverState *bs = opaque;
174
    qemu_co_enter_next(&bs->throttled_reqs[1]);
175
}
176

    
177
/* should be called before bdrv_set_io_limits if a limit is set */
178
void bdrv_io_limits_enable(BlockDriverState *bs)
179
{
180
    assert(!bs->io_limits_enabled);
181
    throttle_init(&bs->throttle_state,
182
                  QEMU_CLOCK_VIRTUAL,
183
                  bdrv_throttle_read_timer_cb,
184
                  bdrv_throttle_write_timer_cb,
185
                  bs);
186
    bs->io_limits_enabled = true;
187
}
188

    
189
/* This function makes an IO wait if needed
190
 *
191
 * @nb_sectors: the number of sectors of the IO
192
 * @is_write:   is the IO a write
193
 */
194
static void bdrv_io_limits_intercept(BlockDriverState *bs,
195
                                     unsigned int bytes,
196
                                     bool is_write)
197
{
198
    /* does this io must wait */
199
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
200

    
201
    /* if must wait or any request of this type throttled queue the IO */
202
    if (must_wait ||
203
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
205
    }
206

    
207
    /* the IO will be executed, do the accounting */
208
    throttle_account(&bs->throttle_state, is_write, bytes);
209

    
210

    
211
    /* if the next request must wait -> do nothing */
212
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213
        return;
214
    }
215

    
216
    /* else queue next request for execution */
217
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
218
}
219

    
220
/*
 * Return the optimal memory alignment recorded in the limits of @bs, or 4096
 * when @bs is NULL or has no driver attached.
 */
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (bs && bs->drv) {
        return bs->bl.opt_mem_alignment;
    }

    /* 4k should be on the safe side */
    return 4096;
}
229

    
230
/*
 * check if the path starts with "<protocol>:"
 *
 * Returns 1 when the first character out of ':' and '/' (plus '\\' on
 * Windows) found in @path is a ':', and 0 otherwise.  On Windows, drive
 * names and drive prefixes are never treated as protocols.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    const char *delimiters = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    const char *delimiters = ":/";
#endif

    /* strcspn() stops at the first delimiter or at the terminating NUL. */
    return path[strcspn(path, delimiters)] == ':';
}
247

    
248
/*
 * Tell whether @path is absolute.
 *
 * Returns 1 if @path starts with '/'.  On Windows a leading '\\', a drive
 * name or a drive prefix also counts (this covers names like "\\.\d:").
 * Returns 0 otherwise.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
260

    
261
/*
 * If @filename is absolute, just copy it to @dest.  Otherwise build a path to
 * it by considering it relative to @base_path.  URLs are supported: the
 * directory part of @base_path ends after its last '/' (or '\\' on Windows)
 * or after its first ':', whichever comes later.
 *
 * Nothing is written when @dest_size is not positive.  The directory part is
 * truncated to fit @dest_size, NUL terminator included.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon;
    const char *after_sep;
    const char *dir_end;
    int dir_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Position just past the first ':' (protocol), or the start if none. */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* Position just past the last path separator, or the start if none. */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* The directory part ends at the later of the two positions. */
    dir_end = (after_sep > after_colon) ? after_sep : after_colon;

    dir_len = dir_end - base_path;
    if (dir_len > dest_size - 1) {
        dir_len = dest_size - 1;
    }
    memcpy(dest, base_path, dir_len);
    dest[dir_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
304

    
305
/*
 * Write the backing file name of @bs into @dest (capacity @sz).
 *
 * An empty backing file name, or one that starts with a protocol prefix, is
 * copied verbatim; anything else is combined with the file name of @bs via
 * path_combine().
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;
    const bool copy_verbatim = backing[0] == '\0' || path_has_protocol(backing);

    if (copy_verbatim) {
        pstrcpy(dest, sz, bs->backing_file);
        return;
    }

    path_combine(dest, sz, bs->filename, bs->backing_file);
}
313

    
314
/*
 * Register @bdrv by inserting it at the head of the global driver list.
 *
 * A driver without bdrv_co_readv gets the coroutine emulation functions
 * installed for both reads and writes.  Those work in terms of AIO, so a
 * driver that also lacks bdrv_aio_readv gets the AIO emulation layer too.
 */
void bdrv_register(BlockDriver *bdrv)
{
    const bool needs_co_emulation = !bdrv->bdrv_co_readv;

    if (needs_co_emulation) {
        /*
         * bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }

        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
333

    
334
/* create a new block device (by default it is empty) */
335
BlockDriverState *bdrv_new(const char *device_name)
336
{
337
    BlockDriverState *bs;
338

    
339
    bs = g_malloc0(sizeof(BlockDriverState));
340
    QLIST_INIT(&bs->dirty_bitmaps);
341
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
342
    if (device_name[0] != '\0') {
343
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
344
    }
345
    bdrv_iostatus_disable(bs);
346
    notifier_list_init(&bs->close_notifiers);
347
    notifier_with_return_list_init(&bs->before_write_notifiers);
348
    qemu_co_queue_init(&bs->throttled_reqs[0]);
349
    qemu_co_queue_init(&bs->throttled_reqs[1]);
350
    bs->refcnt = 1;
351

    
352
    return bs;
353
}
354

    
355
/* Add @notify to the close notifier list of @bs. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
359

    
360
BlockDriver *bdrv_find_format(const char *format_name)
361
{
362
    BlockDriver *drv1;
363
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
364
        if (!strcmp(drv1->format_name, format_name)) {
365
            return drv1;
366
        }
367
    }
368
    return NULL;
369
}
370

    
371
/*
 * Check @drv against the build-time whitelists.
 *
 * Returns 1 if both whitelists are empty, if the driver is on the read-write
 * whitelist, or if @read_only is set and the driver is on the read-only
 * whitelist.  Returns 0 otherwise.
 */
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **entry;

    if (whitelist_rw[0] == NULL && whitelist_ro[0] == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    for (entry = whitelist_rw; *entry != NULL; entry++) {
        if (strcmp(drv->format_name, *entry) == 0) {
            return 1;
        }
    }

    /* The read-only list is consulted for read-only opens only. */
    if (!read_only) {
        return 0;
    }

    for (entry = whitelist_ro; *entry != NULL; entry++) {
        if (strcmp(drv->format_name, *entry) == 0) {
            return 1;
        }
    }

    return 0;
}
399

    
400
/*
 * Like bdrv_find_format(), but return NULL as well when the driver found is
 * not whitelisted for the requested access mode (@read_only).
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (!drv || !bdrv_is_whitelisted(drv, read_only)) {
        return NULL;
    }
    return drv;
}
406

    
407
typedef struct CreateCo {
408
    BlockDriver *drv;
409
    char *filename;
410
    QEMUOptionParameter *options;
411
    int ret;
412
    Error *err;
413
} CreateCo;
414

    
415
/*
 * Coroutine entry point for bdrv_create().
 *
 * @opaque is a CreateCo.  The driver's bdrv_create callback is invoked, any
 * error it reports is propagated into cco->err, and its return value is
 * stored in cco->ret as the last step.
 */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    Error *driver_err = NULL;
    int result;

    assert(cco->drv);

    result = cco->drv->bdrv_create(cco->filename, cco->options, &driver_err);
    if (driver_err != NULL) {
        error_propagate(&cco->err, driver_err);
    }

    /* Written last: bdrv_create() polls this field to detect completion. */
    cco->ret = result;
}
429

    
430
int bdrv_create(BlockDriver *drv, const char* filename,
431
    QEMUOptionParameter *options, Error **errp)
432
{
433
    int ret;
434

    
435
    Coroutine *co;
436
    CreateCo cco = {
437
        .drv = drv,
438
        .filename = g_strdup(filename),
439
        .options = options,
440
        .ret = NOT_DONE,
441
        .err = NULL,
442
    };
443

    
444
    if (!drv->bdrv_create) {
445
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
446
        ret = -ENOTSUP;
447
        goto out;
448
    }
449

    
450
    if (qemu_in_coroutine()) {
451
        /* Fast-path if already in coroutine context */
452
        bdrv_create_co_entry(&cco);
453
    } else {
454
        co = qemu_coroutine_create(bdrv_create_co_entry);
455
        qemu_coroutine_enter(co, &cco);
456
        while (cco.ret == NOT_DONE) {
457
            qemu_aio_wait();
458
        }
459
    }
460

    
461
    ret = cco.ret;
462
    if (ret < 0) {
463
        if (cco.err) {
464
            error_propagate(errp, cco.err);
465
        } else {
466
            error_setg_errno(errp, -ret, "Could not create image");
467
        }
468
    }
469

    
470
out:
471
    g_free(cco.filename);
472
    return ret;
473
}
474

    
475
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
476
                     Error **errp)
477
{
478
    BlockDriver *drv;
479
    Error *local_err = NULL;
480
    int ret;
481

    
482
    drv = bdrv_find_protocol(filename, true);
483
    if (drv == NULL) {
484
        error_setg(errp, "Could not find protocol for file '%s'", filename);
485
        return -ENOENT;
486
    }
487

    
488
    ret = bdrv_create(drv, filename, options, &local_err);
489
    if (local_err) {
490
        error_propagate(errp, local_err);
491
    }
492
    return ret;
493
}
494

    
495
/*
 * Recompute the block limits (bs->bl) of @bs.
 *
 * The limits are zeroed first.  Without a driver nothing else happens.
 * Otherwise defaults are taken from bs->file (or a 512 byte memory alignment
 * when there is no file), raised to the values of the backing file where
 * those are larger, and finally the driver may override them.
 *
 * Returns 0, or the result of the driver's bdrv_refresh_limits callback.
 * The results of the recursive calls on the children are not checked.
 */
int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (drv == NULL) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (!bs->file) {
        bs->bl.opt_mem_alignment = 512;
    } else {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length = MAX(bs->bl.opt_transfer_length,
                                         bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment = MAX(bs->bl.opt_mem_alignment,
                                       bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    return drv->bdrv_refresh_limits ? drv->bdrv_refresh_limits(bs) : 0;
}
531

    
532
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: output buffer that receives the name of the created file
 * @size:     capacity of @filename in bytes
 *
 * On POSIX hosts the file is created as "$TMPDIR/vl.XXXXXX" ("/tmp" when
 * TMPDIR is unset) with mkstemp() and closed again.
 *
 * Return 0 upon success, otherwise a negative errno value: -EOVERFLOW when
 * the name does not fit into @size bytes (including a non-positive @size),
 * or the negated errno of the failing mkstemp()/close() call.
 *
 * NOTE(review): the Windows branch returns the negated GetLastError() code,
 * which is not an errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir;
    int written;
    int fd;

    /* A negative size would turn into a huge size_t inside snprintf(). */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }

    written = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (written < 0 || written >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }

    if (close(fd) != 0) {
        /* Save errno first: unlink() may overwrite it. */
        int saved_errno = errno;

        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
566

    
567
/*
568
 * Detect host devices. By convention, /dev/cdrom[N] is always
569
 * recognized as a host CDROM.
570
 */
571
static BlockDriver *find_hdev_driver(const char *filename)
572
{
573
    int score_max = 0, score;
574
    BlockDriver *drv = NULL, *d;
575

    
576
    QLIST_FOREACH(d, &bdrv_drivers, list) {
577
        if (d->bdrv_probe_device) {
578
            score = d->bdrv_probe_device(filename);
579
            if (score > score_max) {
580
                score_max = score;
581
                drv = d;
582
            }
583
        }
584
    }
585

    
586
    return drv;
587
}
588

    
589
BlockDriver *bdrv_find_protocol(const char *filename,
590
                                bool allow_protocol_prefix)
591
{
592
    BlockDriver *drv1;
593
    char protocol[128];
594
    int len;
595
    const char *p;
596

    
597
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
598

    
599
    /*
600
     * XXX(hch): we really should not let host device detection
601
     * override an explicit protocol specification, but moving this
602
     * later breaks access to device names with colons in them.
603
     * Thanks to the brain-dead persistent naming schemes on udev-
604
     * based Linux systems those actually are quite common.
605
     */
606
    drv1 = find_hdev_driver(filename);
607
    if (drv1) {
608
        return drv1;
609
    }
610

    
611
    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
612
        return bdrv_find_format("file");
613
    }
614

    
615
    p = strchr(filename, ':');
616
    assert(p != NULL);
617
    len = p - filename;
618
    if (len > sizeof(protocol) - 1)
619
        len = sizeof(protocol) - 1;
620
    memcpy(protocol, filename, len);
621
    protocol[len] = '\0';
622
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
623
        if (drv1->protocol_name &&
624
            !strcmp(drv1->protocol_name, protocol)) {
625
            return drv1;
626
        }
627
    }
628
    return NULL;
629
}
630

    
631
static int find_image_format(BlockDriverState *bs, const char *filename,
632
                             BlockDriver **pdrv, Error **errp)
633
{
634
    int score, score_max;
635
    BlockDriver *drv1, *drv;
636
    uint8_t buf[2048];
637
    int ret = 0;
638

    
639
    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
640
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
641
        drv = bdrv_find_format("raw");
642
        if (!drv) {
643
            error_setg(errp, "Could not find raw image format");
644
            ret = -ENOENT;
645
        }
646
        *pdrv = drv;
647
        return ret;
648
    }
649

    
650
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
651
    if (ret < 0) {
652
        error_setg_errno(errp, -ret, "Could not read image for determining its "
653
                         "format");
654
        *pdrv = NULL;
655
        return ret;
656
    }
657

    
658
    score_max = 0;
659
    drv = NULL;
660
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
661
        if (drv1->bdrv_probe) {
662
            score = drv1->bdrv_probe(buf, ret, filename);
663
            if (score > score_max) {
664
                score_max = score;
665
                drv = drv1;
666
            }
667
        }
668
    }
669
    if (!drv) {
670
        error_setg(errp, "Could not determine image format: No compatible "
671
                   "driver found");
672
        ret = -ENOENT;
673
    }
674
    *pdrv = drv;
675
    return ret;
676
}
677

    
678
/**
679
 * Set the current 'total_sectors' value
680
 */
681
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
682
{
683
    BlockDriver *drv = bs->drv;
684

    
685
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
686
    if (bs->sg)
687
        return 0;
688

    
689
    /* query actual device if possible, otherwise just trust the hint */
690
    if (drv->bdrv_getlength) {
691
        int64_t length = drv->bdrv_getlength(bs);
692
        if (length < 0) {
693
            return length;
694
        }
695
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
696
    }
697

    
698
    bs->total_sectors = hint;
699
    return 0;
700
}
701

    
702
/**
703
 * Set open flags for a given discard mode
704
 *
705
 * Return 0 on success, -1 if the discard mode was invalid.
706
 */
707
int bdrv_parse_discard_flags(const char *mode, int *flags)
708
{
709
    *flags &= ~BDRV_O_UNMAP;
710

    
711
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
712
        /* do nothing */
713
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
714
        *flags |= BDRV_O_UNMAP;
715
    } else {
716
        return -1;
717
    }
718

    
719
    return 0;
720
}
721

    
722
/**
723
 * Set open flags for a given cache mode
724
 *
725
 * Return 0 on success, -1 if the cache mode was invalid.
726
 */
727
int bdrv_parse_cache_flags(const char *mode, int *flags)
728
{
729
    *flags &= ~BDRV_O_CACHE_MASK;
730

    
731
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
732
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
733
    } else if (!strcmp(mode, "directsync")) {
734
        *flags |= BDRV_O_NOCACHE;
735
    } else if (!strcmp(mode, "writeback")) {
736
        *flags |= BDRV_O_CACHE_WB;
737
    } else if (!strcmp(mode, "unsafe")) {
738
        *flags |= BDRV_O_CACHE_WB;
739
        *flags |= BDRV_O_NO_FLUSH;
740
    } else if (!strcmp(mode, "writethrough")) {
741
        /* this is the default */
742
    } else {
743
        return -1;
744
    }
745

    
746
    return 0;
747
}
748

    
749
/**
750
 * The copy-on-read flag is actually a reference count so multiple users may
751
 * use the feature without worrying about clobbering its previous state.
752
 * Copy-on-read stays enabled until all users have called to disable it.
753
 */
754
void bdrv_enable_copy_on_read(BlockDriverState *bs)
755
{
756
    bs->copy_on_read++;
757
}
758

    
759
/*
 * Drop one copy-on-read reference taken with bdrv_enable_copy_on_read().
 * The count must be positive (asserted).
 */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
764

    
765
/*
 * Derive the flags handed to the driver's open callback from the flags the
 * caller passed in.
 *
 * BDRV_O_CACHE_WB is always set, the block-layer-internal flags
 * BDRV_O_SNAPSHOT and BDRV_O_NO_BACKING are removed, and temporary images
 * are forced read-write.
 */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    const int internal_flags = BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING;
    int open_flags = (flags | BDRV_O_CACHE_WB) & ~internal_flags;

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
784

    
785
static int bdrv_assign_node_name(BlockDriverState *bs,
786
                                 const char *node_name,
787
                                 Error **errp)
788
{
789
    if (!node_name) {
790
        return 0;
791
    }
792

    
793
    /* empty string node name is invalid */
794
    if (node_name[0] == '\0') {
795
        error_setg(errp, "Empty node name");
796
        return -EINVAL;
797
    }
798

    
799
    /* takes care of avoiding namespaces collisions */
800
    if (bdrv_find(node_name)) {
801
        error_setg(errp, "node-name=%s is conflicting with a device id",
802
                   node_name);
803
        return -EINVAL;
804
    }
805

    
806
    /* takes care of avoiding duplicates node names */
807
    if (bdrv_find_node(node_name)) {
808
        error_setg(errp, "Duplicate node name");
809
        return -EINVAL;
810
    }
811

    
812
    /* copy node name into the bs and insert it into the graph list */
813
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
814
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
815

    
816
    return 0;
817
}
818

    
819
/*
820
 * Common part for opening disk images and files
821
 *
822
 * Removes all processed options from *options.
823
 */
824
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
825
    QDict *options, int flags, BlockDriver *drv, Error **errp)
826
{
827
    int ret, open_flags;
828
    const char *filename;
829
    const char *node_name = NULL;
830
    Error *local_err = NULL;
831

    
832
    assert(drv != NULL);
833
    assert(bs->file == NULL);
834
    assert(options != NULL && bs->options != options);
835

    
836
    if (file != NULL) {
837
        filename = file->filename;
838
    } else {
839
        filename = qdict_get_try_str(options, "filename");
840
    }
841

    
842
    if (drv->bdrv_needs_filename && !filename) {
843
        error_setg(errp, "The '%s' block driver requires a file name",
844
                   drv->format_name);
845
        return -EINVAL;
846
    }
847

    
848
    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
849

    
850
    node_name = qdict_get_try_str(options, "node-name");
851
    ret = bdrv_assign_node_name(bs, node_name, errp);
852
    if (ret < 0) {
853
        return ret;
854
    }
855
    qdict_del(options, "node-name");
856

    
857
    /* bdrv_open() with directly using a protocol as drv. This layer is already
858
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
859
     * and return immediately. */
860
    if (file != NULL && drv->bdrv_file_open) {
861
        bdrv_swap(file, bs);
862
        return 0;
863
    }
864

    
865
    bs->open_flags = flags;
866
    bs->guest_block_size = 512;
867
    bs->request_alignment = 512;
868
    bs->zero_beyond_eof = true;
869
    open_flags = bdrv_open_flags(bs, flags);
870
    bs->read_only = !(open_flags & BDRV_O_RDWR);
871

    
872
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
873
        error_setg(errp,
874
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
875
                        ? "Driver '%s' can only be used for read-only devices"
876
                        : "Driver '%s' is not whitelisted",
877
                   drv->format_name);
878
        return -ENOTSUP;
879
    }
880

    
881
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
882
    if (flags & BDRV_O_COPY_ON_READ) {
883
        if (!bs->read_only) {
884
            bdrv_enable_copy_on_read(bs);
885
        } else {
886
            error_setg(errp, "Can't use copy-on-read on read-only device");
887
            return -EINVAL;
888
        }
889
    }
890

    
891
    if (filename != NULL) {
892
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
893
    } else {
894
        bs->filename[0] = '\0';
895
    }
896

    
897
    bs->drv = drv;
898
    bs->opaque = g_malloc0(drv->instance_size);
899

    
900
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
901

    
902
    /* Open the image, either directly or using a protocol */
903
    if (drv->bdrv_file_open) {
904
        assert(file == NULL);
905
        assert(!drv->bdrv_needs_filename || filename != NULL);
906
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
907
    } else {
908
        if (file == NULL) {
909
            error_setg(errp, "Can't use '%s' as a block driver for the "
910
                       "protocol level", drv->format_name);
911
            ret = -EINVAL;
912
            goto free_and_fail;
913
        }
914
        bs->file = file;
915
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
916
    }
917

    
918
    if (ret < 0) {
919
        if (local_err) {
920
            error_propagate(errp, local_err);
921
        } else if (bs->filename[0]) {
922
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
923
        } else {
924
            error_setg_errno(errp, -ret, "Could not open image");
925
        }
926
        goto free_and_fail;
927
    }
928

    
929
    ret = refresh_total_sectors(bs, bs->total_sectors);
930
    if (ret < 0) {
931
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
932
        goto free_and_fail;
933
    }
934

    
935
    bdrv_refresh_limits(bs);
936
    assert(bdrv_opt_mem_align(bs) != 0);
937
    assert(bs->request_alignment != 0);
938

    
939
#ifndef _WIN32
940
    if (bs->is_temporary) {
941
        assert(bs->filename[0] != '\0');
942
        unlink(bs->filename);
943
    }
944
#endif
945
    return 0;
946

    
947
free_and_fail:
948
    bs->file = NULL;
949
    g_free(bs->opaque);
950
    bs->opaque = NULL;
951
    bs->drv = NULL;
952
    return ret;
953
}
954

    
955
/*
956
 * Opens a file using a protocol (file, host_device, nbd, ...)
957
 *
958
 * options is a QDict of options to pass to the block drivers, or NULL for an
959
 * empty set of options. The reference to the QDict belongs to the block layer
960
 * after the call (even on failure), so if the caller intends to reuse the
961
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
962
 */
963
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
964
                   const char *reference, QDict *options, int flags,
965
                   Error **errp)
966
{
967
    BlockDriverState *bs = NULL;
968
    BlockDriver *drv;
969
    const char *drvname;
970
    bool allow_protocol_prefix = false;
971
    Error *local_err = NULL;
972
    int ret;
973

    
974
    /* NULL means an empty set of options */
975
    if (options == NULL) {
976
        options = qdict_new();
977
    }
978

    
979
    if (reference) {
980
        if (filename || qdict_size(options)) {
981
            error_setg(errp, "Cannot reference an existing block device with "
982
                       "additional options or a new filename");
983
            return -EINVAL;
984
        }
985
        QDECREF(options);
986

    
987
        bs = bdrv_lookup_bs(reference, reference, errp);
988
        if (!bs) {
989
            return -ENODEV;
990
        }
991
        bdrv_ref(bs);
992
        *pbs = bs;
993
        return 0;
994
    }
995

    
996
    bs = bdrv_new("");
997
    bs->options = options;
998
    options = qdict_clone_shallow(options);
999

    
1000
    /* Fetch the file name from the options QDict if necessary */
1001
    if (!filename) {
1002
        filename = qdict_get_try_str(options, "filename");
1003
    } else if (filename && !qdict_haskey(options, "filename")) {
1004
        qdict_put(options, "filename", qstring_from_str(filename));
1005
        allow_protocol_prefix = true;
1006
    } else {
1007
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
1008
                   "same time");
1009
        ret = -EINVAL;
1010
        goto fail;
1011
    }
1012

    
1013
    /* Find the right block driver */
1014
    drvname = qdict_get_try_str(options, "driver");
1015
    if (drvname) {
1016
        drv = bdrv_find_format(drvname);
1017
        if (!drv) {
1018
            error_setg(errp, "Unknown driver '%s'", drvname);
1019
        }
1020
        qdict_del(options, "driver");
1021
    } else if (filename) {
1022
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
1023
        if (!drv) {
1024
            error_setg(errp, "Unknown protocol");
1025
        }
1026
    } else {
1027
        error_setg(errp, "Must specify either driver or file");
1028
        drv = NULL;
1029
    }
1030

    
1031
    if (!drv) {
1032
        /* errp has been set already */
1033
        ret = -ENOENT;
1034
        goto fail;
1035
    }
1036

    
1037
    /* Parse the filename and open it */
1038
    if (drv->bdrv_parse_filename && filename) {
1039
        drv->bdrv_parse_filename(filename, options, &local_err);
1040
        if (local_err) {
1041
            error_propagate(errp, local_err);
1042
            ret = -EINVAL;
1043
            goto fail;
1044
        }
1045
        qdict_del(options, "filename");
1046
    }
1047

    
1048
    if (!drv->bdrv_file_open) {
1049
        ret = bdrv_open(&bs, filename, options, flags, drv, &local_err);
1050
        options = NULL;
1051
    } else {
1052
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
1053
    }
1054
    if (ret < 0) {
1055
        error_propagate(errp, local_err);
1056
        goto fail;
1057
    }
1058

    
1059
    /* Check if any unknown options were used */
1060
    if (options && (qdict_size(options) != 0)) {
1061
        const QDictEntry *entry = qdict_first(options);
1062
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
1063
                   drv->format_name, entry->key);
1064
        ret = -EINVAL;
1065
        goto fail;
1066
    }
1067
    QDECREF(options);
1068

    
1069
    bs->growable = 1;
1070
    *pbs = bs;
1071
    return 0;
1072

    
1073
fail:
1074
    QDECREF(options);
1075
    if (!bs->drv) {
1076
        QDECREF(bs->options);
1077
    }
1078
    bdrv_unref(bs);
1079
    return ret;
1080
}
1081

    
1082
/*
1083
 * Opens the backing file for a BlockDriverState if not yet open
1084
 *
1085
 * options is a QDict of options to pass to the block drivers, or NULL for an
1086
 * empty set of options. The reference to the QDict is transferred to this
1087
 * function (even on failure), so if the caller intends to reuse the dictionary,
1088
 * it needs to use QINCREF() before calling bdrv_file_open.
1089
 */
1090
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1091
{
1092
    char backing_filename[PATH_MAX];
1093
    int back_flags, ret;
1094
    BlockDriver *back_drv = NULL;
1095
    Error *local_err = NULL;
1096

    
1097
    if (bs->backing_hd != NULL) {
1098
        QDECREF(options);
1099
        return 0;
1100
    }
1101

    
1102
    /* NULL means an empty set of options */
1103
    if (options == NULL) {
1104
        options = qdict_new();
1105
    }
1106

    
1107
    bs->open_flags &= ~BDRV_O_NO_BACKING;
1108
    if (qdict_haskey(options, "file.filename")) {
1109
        backing_filename[0] = '\0';
1110
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1111
        QDECREF(options);
1112
        return 0;
1113
    } else {
1114
        bdrv_get_full_backing_filename(bs, backing_filename,
1115
                                       sizeof(backing_filename));
1116
    }
1117

    
1118
    if (bs->backing_format[0] != '\0') {
1119
        back_drv = bdrv_find_format(bs->backing_format);
1120
    }
1121

    
1122
    /* backing files always opened read-only */
1123
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1124
                                    BDRV_O_COPY_ON_READ);
1125

    
1126
    assert(bs->backing_hd == NULL);
1127
    ret = bdrv_open(&bs->backing_hd,
1128
                    *backing_filename ? backing_filename : NULL, options,
1129
                    back_flags, back_drv, &local_err);
1130
    if (ret < 0) {
1131
        bs->backing_hd = NULL;
1132
        bs->open_flags |= BDRV_O_NO_BACKING;
1133
        error_setg(errp, "Could not open backing file: %s",
1134
                   error_get_pretty(local_err));
1135
        error_free(local_err);
1136
        return ret;
1137
    }
1138

    
1139
    if (bs->backing_hd->file) {
1140
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1141
                bs->backing_hd->file->filename);
1142
    }
1143

    
1144
    /* Recalculate the BlockLimits with the backing file */
1145
    bdrv_refresh_limits(bs);
1146

    
1147
    return 0;
1148
}
1149

    
1150
/*
1151
 * Opens a disk image whose options are given as BlockdevRef in another block
1152
 * device's options.
1153
 *
1154
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
1155
 * image format auto-detection. If it is false and a filename is given,
1156
 * bdrv_open() will be used for auto-detection.
1157
 *
1158
 * If allow_none is true, no image will be opened if filename is false and no
1159
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1160
 *
1161
 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1162
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1163
 * itself, all options starting with "${bdref_key}." are considered part of the
1164
 * BlockdevRef.
1165
 *
1166
 * The BlockdevRef will be removed from the options QDict.
1167
 *
1168
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1169
 */
1170
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1171
                    QDict *options, const char *bdref_key, int flags,
1172
                    bool force_raw, bool allow_none, Error **errp)
1173
{
1174
    QDict *image_options;
1175
    int ret;
1176
    char *bdref_key_dot;
1177
    const char *reference;
1178

    
1179
    assert(pbs);
1180
    assert(*pbs == NULL);
1181

    
1182
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1183
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1184
    g_free(bdref_key_dot);
1185

    
1186
    reference = qdict_get_try_str(options, bdref_key);
1187
    if (!filename && !reference && !qdict_size(image_options)) {
1188
        if (allow_none) {
1189
            ret = 0;
1190
        } else {
1191
            error_setg(errp, "A block device must be specified for \"%s\"",
1192
                       bdref_key);
1193
            ret = -EINVAL;
1194
        }
1195
        goto done;
1196
    }
1197

    
1198
    if (filename && !force_raw) {
1199
        /* If a filename is given and the block driver should be detected
1200
           automatically (instead of using none), use bdrv_open() in order to do
1201
           that auto-detection. */
1202
        if (reference) {
1203
            error_setg(errp, "Cannot reference an existing block device while "
1204
                       "giving a filename");
1205
            ret = -EINVAL;
1206
            goto done;
1207
        }
1208

    
1209
        ret = bdrv_open(pbs, filename, image_options, flags, NULL, errp);
1210
    } else {
1211
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
1212
                             errp);
1213
    }
1214

    
1215
done:
1216
    qdict_del(options, bdref_key);
1217
    return ret;
1218
}
1219

    
1220
/*
1221
 * Opens a disk image (raw, qcow2, vmdk, ...)
1222
 *
1223
 * options is a QDict of options to pass to the block drivers, or NULL for an
1224
 * empty set of options. The reference to the QDict belongs to the block layer
1225
 * after the call (even on failure), so if the caller intends to reuse the
1226
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1227
 *
1228
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1229
 * If it is not NULL, the referenced BDS will be reused.
1230
 */
1231
int bdrv_open(BlockDriverState **pbs, const char *filename, QDict *options,
1232
              int flags, BlockDriver *drv, Error **errp)
1233
{
1234
    int ret;
1235
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1236
    char tmp_filename[PATH_MAX + 1];
1237
    BlockDriverState *file = NULL, *bs;
1238
    const char *drvname;
1239
    Error *local_err = NULL;
1240

    
1241
    assert(pbs);
1242

    
1243
    if (*pbs) {
1244
        bs = *pbs;
1245
    } else {
1246
        bs = bdrv_new("");
1247
    }
1248

    
1249
    /* NULL means an empty set of options */
1250
    if (options == NULL) {
1251
        options = qdict_new();
1252
    }
1253

    
1254
    bs->options = options;
1255
    options = qdict_clone_shallow(options);
1256

    
1257
    /* For snapshot=on, create a temporary qcow2 overlay */
1258
    if (flags & BDRV_O_SNAPSHOT) {
1259
        BlockDriverState *bs1;
1260
        int64_t total_size;
1261
        BlockDriver *bdrv_qcow2;
1262
        QEMUOptionParameter *create_options;
1263
        QDict *snapshot_options;
1264

    
1265
        /* if snapshot, we create a temporary backing file and open it
1266
           instead of opening 'filename' directly */
1267

    
1268
        /* Get the required size from the image */
1269
        QINCREF(options);
1270
        bs1 = NULL;
1271
        ret = bdrv_open(&bs1, filename, options, BDRV_O_NO_BACKING,
1272
                        drv, &local_err);
1273
        if (ret < 0) {
1274
            goto fail;
1275
        }
1276
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1277

    
1278
        bdrv_unref(bs1);
1279

    
1280
        /* Create the temporary image */
1281
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1282
        if (ret < 0) {
1283
            error_setg_errno(errp, -ret, "Could not get temporary filename");
1284
            goto fail;
1285
        }
1286

    
1287
        bdrv_qcow2 = bdrv_find_format("qcow2");
1288
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1289
                                                 NULL);
1290

    
1291
        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1292

    
1293
        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1294
        free_option_parameters(create_options);
1295
        if (ret < 0) {
1296
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
1297
                             "'%s': %s", tmp_filename,
1298
                             error_get_pretty(local_err));
1299
            error_free(local_err);
1300
            local_err = NULL;
1301
            goto fail;
1302
        }
1303

    
1304
        /* Prepare a new options QDict for the temporary file, where user
1305
         * options refer to the backing file */
1306
        if (filename) {
1307
            qdict_put(options, "file.filename", qstring_from_str(filename));
1308
        }
1309
        if (drv) {
1310
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
1311
        }
1312

    
1313
        snapshot_options = qdict_new();
1314
        qdict_put(snapshot_options, "backing", options);
1315
        qdict_flatten(snapshot_options);
1316

    
1317
        bs->options = snapshot_options;
1318
        options = qdict_clone_shallow(bs->options);
1319

    
1320
        filename = tmp_filename;
1321
        drv = bdrv_qcow2;
1322
        bs->is_temporary = 1;
1323
    }
1324

    
1325
    /* Open image file without format layer */
1326
    if (flags & BDRV_O_RDWR) {
1327
        flags |= BDRV_O_ALLOW_RDWR;
1328
    }
1329

    
1330
    assert(file == NULL);
1331
    ret = bdrv_open_image(&file, filename, options, "file",
1332
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
1333
                          &local_err);
1334
    if (ret < 0) {
1335
        goto fail;
1336
    }
1337

    
1338
    /* Find the right image format driver */
1339
    drvname = qdict_get_try_str(options, "driver");
1340
    if (drvname) {
1341
        drv = bdrv_find_format(drvname);
1342
        qdict_del(options, "driver");
1343
        if (!drv) {
1344
            error_setg(errp, "Invalid driver: '%s'", drvname);
1345
            ret = -EINVAL;
1346
            goto unlink_and_fail;
1347
        }
1348
    }
1349

    
1350
    if (!drv) {
1351
        if (file) {
1352
            ret = find_image_format(file, filename, &drv, &local_err);
1353
        } else {
1354
            error_setg(errp, "Must specify either driver or file");
1355
            ret = -EINVAL;
1356
            goto unlink_and_fail;
1357
        }
1358
    }
1359

    
1360
    if (!drv) {
1361
        goto unlink_and_fail;
1362
    }
1363

    
1364
    /* Open the image */
1365
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1366
    if (ret < 0) {
1367
        goto unlink_and_fail;
1368
    }
1369

    
1370
    if (file && (bs->file != file)) {
1371
        bdrv_unref(file);
1372
        file = NULL;
1373
    }
1374

    
1375
    /* If there is a backing file, use it */
1376
    if ((flags & BDRV_O_NO_BACKING) == 0) {
1377
        QDict *backing_options;
1378

    
1379
        qdict_extract_subqdict(options, &backing_options, "backing.");
1380
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1381
        if (ret < 0) {
1382
            goto close_and_fail;
1383
        }
1384
    }
1385

    
1386
    /* Check if any unknown options were used */
1387
    if (qdict_size(options) != 0) {
1388
        const QDictEntry *entry = qdict_first(options);
1389
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1390
                   "support the option '%s'", drv->format_name, bs->device_name,
1391
                   entry->key);
1392

    
1393
        ret = -EINVAL;
1394
        goto close_and_fail;
1395
    }
1396
    QDECREF(options);
1397

    
1398
    if (!bdrv_key_required(bs)) {
1399
        bdrv_dev_change_media_cb(bs, true);
1400
    }
1401

    
1402
    *pbs = bs;
1403
    return 0;
1404

    
1405
unlink_and_fail:
1406
    if (file != NULL) {
1407
        bdrv_unref(file);
1408
    }
1409
    if (bs->is_temporary) {
1410
        unlink(filename);
1411
    }
1412
fail:
1413
    QDECREF(bs->options);
1414
    QDECREF(options);
1415
    bs->options = NULL;
1416
    if (!*pbs) {
1417
        /* If *pbs is NULL, a new BDS has been created in this function and
1418
           needs to be freed now. Otherwise, it does not need to be closed,
1419
           since it has not really been opened yet. */
1420
        bdrv_unref(bs);
1421
    }
1422
    if (local_err) {
1423
        error_propagate(errp, local_err);
1424
    }
1425
    return ret;
1426

    
1427
close_and_fail:
1428
    /* See fail path, but now the BDS has to be always closed */
1429
    if (*pbs) {
1430
        bdrv_close(bs);
1431
    } else {
1432
        bdrv_unref(bs);
1433
    }
1434
    QDECREF(options);
1435
    if (local_err) {
1436
        error_propagate(errp, local_err);
1437
    }
1438
    return ret;
1439
}
1440

    
1441
typedef struct BlockReopenQueueEntry {
1442
     bool prepared;
1443
     BDRVReopenState state;
1444
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1445
} BlockReopenQueueEntry;
1446

    
1447
/*
1448
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1449
 * reopen of multiple devices.
1450
 *
1451
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1452
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1453
 * be created and initialized. This newly created BlockReopenQueue should be
1454
 * passed back in for subsequent calls that are intended to be of the same
1455
 * atomic 'set'.
1456
 *
1457
 * bs is the BlockDriverState to add to the reopen queue.
1458
 *
1459
 * flags contains the open flags for the associated bs
1460
 *
1461
 * returns a pointer to bs_queue, which is either the newly allocated
1462
 * bs_queue, or the existing bs_queue being used.
1463
 *
1464
 */
1465
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1466
                                    BlockDriverState *bs, int flags)
1467
{
1468
    assert(bs != NULL);
1469

    
1470
    BlockReopenQueueEntry *bs_entry;
1471
    if (bs_queue == NULL) {
1472
        bs_queue = g_new0(BlockReopenQueue, 1);
1473
        QSIMPLEQ_INIT(bs_queue);
1474
    }
1475

    
1476
    if (bs->file) {
1477
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1478
    }
1479

    
1480
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1481
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1482

    
1483
    bs_entry->state.bs = bs;
1484
    bs_entry->state.flags = flags;
1485

    
1486
    return bs_queue;
1487
}
1488

    
1489
/*
1490
 * Reopen multiple BlockDriverStates atomically & transactionally.
1491
 *
1492
 * The queue passed in (bs_queue) must have been built up previous
1493
 * via bdrv_reopen_queue().
1494
 *
1495
 * Reopens all BDS specified in the queue, with the appropriate
1496
 * flags.  All devices are prepared for reopen, and failure of any
1497
 * device will cause all device changes to be abandonded, and intermediate
1498
 * data cleaned up.
1499
 *
1500
 * If all devices prepare successfully, then the changes are committed
1501
 * to all devices.
1502
 *
1503
 */
1504
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1505
{
1506
    int ret = -1;
1507
    BlockReopenQueueEntry *bs_entry, *next;
1508
    Error *local_err = NULL;
1509

    
1510
    assert(bs_queue != NULL);
1511

    
1512
    bdrv_drain_all();
1513

    
1514
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1515
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1516
            error_propagate(errp, local_err);
1517
            goto cleanup;
1518
        }
1519
        bs_entry->prepared = true;
1520
    }
1521

    
1522
    /* If we reach this point, we have success and just need to apply the
1523
     * changes
1524
     */
1525
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1526
        bdrv_reopen_commit(&bs_entry->state);
1527
    }
1528

    
1529
    ret = 0;
1530

    
1531
cleanup:
1532
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1533
        if (ret && bs_entry->prepared) {
1534
            bdrv_reopen_abort(&bs_entry->state);
1535
        }
1536
        g_free(bs_entry);
1537
    }
1538
    g_free(bs_queue);
1539
    return ret;
1540
}
1541

    
1542

    
1543
/* Reopen a single BlockDriverState with the specified flags. */
1544
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1545
{
1546
    int ret = -1;
1547
    Error *local_err = NULL;
1548
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1549

    
1550
    ret = bdrv_reopen_multiple(queue, &local_err);
1551
    if (local_err != NULL) {
1552
        error_propagate(errp, local_err);
1553
    }
1554
    return ret;
1555
}
1556

    
1557

    
1558
/*
1559
 * Prepares a BlockDriverState for reopen. All changes are staged in the
1560
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1561
 * the block driver layer .bdrv_reopen_prepare()
1562
 *
1563
 * bs is the BlockDriverState to reopen
1564
 * flags are the new open flags
1565
 * queue is the reopen queue
1566
 *
1567
 * Returns 0 on success, non-zero on error.  On error errp will be set
1568
 * as well.
1569
 *
1570
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1571
 * It is the responsibility of the caller to then call the abort() or
1572
 * commit() for any other BDS that have been left in a prepare() state
1573
 *
1574
 */
1575
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1576
                        Error **errp)
1577
{
1578
    int ret = -1;
1579
    Error *local_err = NULL;
1580
    BlockDriver *drv;
1581

    
1582
    assert(reopen_state != NULL);
1583
    assert(reopen_state->bs->drv != NULL);
1584
    drv = reopen_state->bs->drv;
1585

    
1586
    /* if we are to stay read-only, do not allow permission change
1587
     * to r/w */
1588
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1589
        reopen_state->flags & BDRV_O_RDWR) {
1590
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1591
                  reopen_state->bs->device_name);
1592
        goto error;
1593
    }
1594

    
1595

    
1596
    ret = bdrv_flush(reopen_state->bs);
1597
    if (ret) {
1598
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1599
                  strerror(-ret));
1600
        goto error;
1601
    }
1602

    
1603
    if (drv->bdrv_reopen_prepare) {
1604
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1605
        if (ret) {
1606
            if (local_err != NULL) {
1607
                error_propagate(errp, local_err);
1608
            } else {
1609
                error_setg(errp, "failed while preparing to reopen image '%s'",
1610
                           reopen_state->bs->filename);
1611
            }
1612
            goto error;
1613
        }
1614
    } else {
1615
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1616
         * handler for each supported drv. */
1617
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1618
                  drv->format_name, reopen_state->bs->device_name,
1619
                 "reopening of file");
1620
        ret = -1;
1621
        goto error;
1622
    }
1623

    
1624
    ret = 0;
1625

    
1626
error:
1627
    return ret;
1628
}
1629

    
1630
/*
1631
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1632
 * makes them final by swapping the staging BlockDriverState contents into
1633
 * the active BlockDriverState contents.
1634
 */
1635
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1636
{
1637
    BlockDriver *drv;
1638

    
1639
    assert(reopen_state != NULL);
1640
    drv = reopen_state->bs->drv;
1641
    assert(drv != NULL);
1642

    
1643
    /* If there are any driver level actions to take */
1644
    if (drv->bdrv_reopen_commit) {
1645
        drv->bdrv_reopen_commit(reopen_state);
1646
    }
1647

    
1648
    /* set BDS specific flags now */
1649
    reopen_state->bs->open_flags         = reopen_state->flags;
1650
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1651
                                              BDRV_O_CACHE_WB);
1652
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1653

    
1654
    bdrv_refresh_limits(reopen_state->bs);
1655
}
1656

    
1657
/*
1658
 * Abort the reopen, and delete and free the staged changes in
1659
 * reopen_state
1660
 */
1661
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1662
{
1663
    BlockDriver *drv;
1664

    
1665
    assert(reopen_state != NULL);
1666
    drv = reopen_state->bs->drv;
1667
    assert(drv != NULL);
1668

    
1669
    if (drv->bdrv_reopen_abort) {
1670
        drv->bdrv_reopen_abort(reopen_state);
1671
    }
1672
}
1673

    
1674

    
1675
/*
 * Closes bs: cancels its block job, drains and flushes I/O, notifies the
 * close notifiers and, if a driver is attached, releases the backing file,
 * the driver state, the options and bs->file and resets the per-image fields.
 * The BlockDriverState itself is not freed.
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }

    /* complete I/O */
    bdrv_drain_all();
    bdrv_flush(bs);
    /* in case flush left pending I/O */
    bdrv_drain_all();

    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }

        /* Let the driver tear down, then release its private state */
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
#ifdef _WIN32
        /* On other hosts the temporary file was already unlinked at open */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->drv = NULL;

        /* Forget everything that described the image just closed */
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->copy_on_read = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->growable = 0;
        bs->sg = 0;
        bs->zero_beyond_eof = false;

        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1724

    
1725
void bdrv_close_all(void)
1726
{
1727
    BlockDriverState *bs;
1728

    
1729
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1730
        bdrv_close(bs);
1731
    }
1732
}
1733

    
1734
/* Check if any requests are in-flight (including throttled requests) */
1735
static bool bdrv_requests_pending(BlockDriverState *bs)
1736
{
1737
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
1738
        return true;
1739
    }
1740
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1741
        return true;
1742
    }
1743
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1744
        return true;
1745
    }
1746
    if (bs->file && bdrv_requests_pending(bs->file)) {
1747
        return true;
1748
    }
1749
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1750
        return true;
1751
    }
1752
    return false;
1753
}
1754

    
1755
static bool bdrv_requests_pending_all(void)
1756
{
1757
    BlockDriverState *bs;
1758
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1759
        if (bdrv_requests_pending(bs)) {
1760
            return true;
1761
        }
1762
    }
1763
    return false;
1764
}
1765

    
1766
/*
1767
 * Wait for pending requests to complete across all BlockDriverStates
1768
 *
1769
 * This function does not flush data to disk, use bdrv_flush_all() for that
1770
 * after calling this function.
1771
 *
1772
 * Note that completion of an asynchronous I/O operation can trigger any
1773
 * number of other I/O operations on other devices---for example a coroutine
1774
 * can be arbitrarily complex and a constant flow of I/O can come until the
1775
 * coroutine is complete.  Because of this, it is not possible to have a
1776
 * function to drain a single device's I/O queue.
1777
 */
1778
void bdrv_drain_all(void)
1779
{
1780
    /* Always run first iteration so any pending completion BHs run */
1781
    bool busy = true;
1782
    BlockDriverState *bs;
1783

    
1784
    while (busy) {
1785
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1786
            bdrv_start_throttled_reqs(bs);
1787
        }
1788

    
1789
        busy = bdrv_requests_pending_all();
1790
        busy |= aio_poll(qemu_get_aio_context(), busy);
1791
    }
1792
}
1793

    
1794
/* make a BlockDriverState anonymous by removing from bdrv_state and
1795
 * graph_bdrv_state list.
1796
   Also, NULL terminate the device_name to prevent double remove */
1797
void bdrv_make_anon(BlockDriverState *bs)
1798
{
1799
    if (bs->device_name[0] != '\0') {
1800
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1801
    }
1802
    bs->device_name[0] = '\0';
1803
    if (bs->node_name[0] != '\0') {
1804
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1805
    }
1806
    bs->node_name[0] = '\0';
1807
}
1808

    
1809
/* Invokes the driver's optional .bdrv_rebind() hook on bs, if there is one. */
static void bdrv_rebind(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_rebind) {
        return;
    }
    drv->bdrv_rebind(bs);
}
1815

    
1816
/*
 * Copies from bs_src to bs_dest the fields that have to stay attached to the
 * device rather than follow the image contents. bdrv_swap() uses this to put
 * those fields back after exchanging two BlockDriverStates wholesale.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* flags the BDS was opened with */
    bs_dest->open_flags = bs_src->open_flags;

    /* attached guest device */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* I/O throttling state and the requests queued by it */
    memcpy(&bs_dest->throttle_state, &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmaps */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* block job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list = bs_src->node_list;
}
1867

    
1868
/*
1869
 * Swap bs contents for two image chains while they are live,
1870
 * while keeping required fields on the BlockDriverState that is
1871
 * actually attached to a device.
1872
 *
1873
 * This will modify the BlockDriverState fields, and swap contents
1874
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1875
 *
1876
 * bs_new is required to be anonymous.
1877
 *
1878
 * This function does not create any image files.
1879
 */
1880
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1881
{
1882
    BlockDriverState tmp;
1883

    
1884
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1885
    assert(bs_new->device_name[0] == '\0');
1886
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1887
    assert(bs_new->job == NULL);
1888
    assert(bs_new->dev == NULL);
1889
    assert(bs_new->in_use == 0);
1890
    assert(bs_new->io_limits_enabled == false);
1891
    assert(!throttle_have_timer(&bs_new->throttle_state));
1892

    
1893
    tmp = *bs_new;
1894
    *bs_new = *bs_old;
1895
    *bs_old = tmp;
1896

    
1897
    /* there are some fields that should not be swapped, move them back */
1898
    bdrv_move_feature_fields(&tmp, bs_old);
1899
    bdrv_move_feature_fields(bs_old, bs_new);
1900
    bdrv_move_feature_fields(bs_new, &tmp);
1901

    
1902
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1903
    assert(bs_new->device_name[0] == '\0');
1904

    
1905
    /* Check a few fields that should remain attached to the device */
1906
    assert(bs_new->dev == NULL);
1907
    assert(bs_new->job == NULL);
1908
    assert(bs_new->in_use == 0);
1909
    assert(bs_new->io_limits_enabled == false);
1910
    assert(!throttle_have_timer(&bs_new->throttle_state));
1911

    
1912
    bdrv_rebind(bs_new);
1913
    bdrv_rebind(bs_old);
1914
}
1915

    
1916
/*
1917
 * Add new bs contents at the top of an image chain while the chain is
1918
 * live, while keeping required fields on the top layer.
1919
 *
1920
 * This will modify the BlockDriverState fields, and swap contents
1921
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1922
 *
1923
 * bs_new is required to be anonymous.
1924
 *
1925
 * This function does not create any image files.
1926
 */
1927
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1928
{
1929
    bdrv_swap(bs_new, bs_top);
1930

    
1931
    /* The contents of 'tmp' will become bs_top, as we are
1932
     * swapping bs_new and bs_top contents. */
1933
    bs_top->backing_hd = bs_new;
1934
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1935
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1936
            bs_new->filename);
1937
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1938
            bs_new->drv ? bs_new->drv->format_name : "");
1939
}
1940

    
1941
/*
 * Close and free a BlockDriverState.
 *
 * The caller must have detached any device and job, cleared the in-use
 * marker, dropped the last reference and released all dirty bitmaps; each
 * of these is asserted.
 */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* drop it from the list if it is still on one */
    bdrv_make_anon(bs);

    g_free(bs);
}
1956

    
1957
/*
 * Attach the device 'dev' to 'bs' and reset the I/O status.
 *
 * Returns 0 on success, -EBUSY if a device is already attached.
 *
 * TODO change to DeviceState *dev when all users are qdevified
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
{
    if (bs->dev != NULL) {
        return -EBUSY;
    }

    bs->dev = dev;
    bdrv_iostatus_reset(bs);

    return 0;
}
1967

    
1968
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1969
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1970
{
1971
    if (bdrv_attach_dev(bs, dev) < 0) {
1972
        abort();
1973
    }
1974
}
1975

    
1976
/*
 * Detach 'dev' from 'bs'.  'dev' must be the currently attached device
 * (asserted).  The device ops and their opaque pointer are cleared and the
 * guest block size is set back to 512.
 *
 * TODO change to DeviceState *dev when all users are qdevified
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
{
    assert(bs->dev == dev);

    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;

    bs->guest_block_size = 512;
}
1985

    
1986
/* TODO change to return DeviceState * when all users are qdevified */
1987
void *bdrv_get_attached_dev(BlockDriverState *bs)
1988
{
1989
    return bs->dev;
1990
}
1991

    
1992
/*
 * Install the device callback table 'ops' on 'bs', together with the
 * 'opaque' pointer that is handed to every callback.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_opaque = opaque;
    bs->dev_ops = ops;
}
1998

    
1999
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2000
                               enum MonitorEvent ev,
2001
                               BlockErrorAction action, bool is_read)
2002
{
2003
    QObject *data;
2004
    const char *action_str;
2005

    
2006
    switch (action) {
2007
    case BDRV_ACTION_REPORT:
2008
        action_str = "report";
2009
        break;
2010
    case BDRV_ACTION_IGNORE:
2011
        action_str = "ignore";
2012
        break;
2013
    case BDRV_ACTION_STOP:
2014
        action_str = "stop";
2015
        break;
2016
    default:
2017
        abort();
2018
    }
2019

    
2020
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2021
                              bdrv->device_name,
2022
                              action_str,
2023
                              is_read ? "read" : "write");
2024
    monitor_protocol_event(ev, data);
2025

    
2026
    qobject_decref(data);
2027
}
2028

    
2029
/*
 * Emit a QEVENT_DEVICE_TRAY_MOVED monitor event for 'bs'; 'ejected' is
 * reported as the 'tray-open' value.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *event_data =
        qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                           bdrv_get_device_name(bs), ejected);

    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event_data);
    qobject_decref(event_data);
}
2039

    
2040
/*
 * Notify the attached device of a media change, if it registered a
 * change_media_cb, and emit the matching tray events: "tray open" when the
 * tray was closed before the callback ran, followed by "tray close" when
 * 'load' is true.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool tray_was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* sample the tray state before the device gets to change it */
    tray_was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (tray_was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
2055

    
2056
/*
 * Return true if no device is attached to 'bs', or if the attached device
 * registered a change_media_cb callback.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
2060

    
2061
/*
 * Forward an eject request (with its 'force' flag) to the attached device,
 * if it registered an eject_request_cb; otherwise do nothing.
 */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (!bs->dev_ops || !bs->dev_ops->eject_request_cb) {
        return;
    }
    bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
}
2067

    
2068
/*
 * Ask the attached device whether its tray is open.  Returns false when
 * the device did not register an is_tray_open callback.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_tray_open) {
        return false;
    }
    return bs->dev_ops->is_tray_open(bs->dev_opaque);
}
2075

    
2076
/*
 * Invoke the attached device's resize_cb callback, if one is registered.
 */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->resize_cb) {
        return;
    }
    bs->dev_ops->resize_cb(bs->dev_opaque);
}
2082

    
2083
/*
 * Ask the attached device whether its medium is locked.  Returns false
 * when the device did not register an is_medium_locked callback.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_medium_locked) {
        return false;
    }
    return bs->dev_ops->is_medium_locked(bs->dev_opaque);
}
2090

    
2091
/*
2092
 * Run consistency checks on an image
2093
 *
2094
 * Returns 0 if the check could be completed (it doesn't mean that the image is
2095
 * free of errors) or -errno when an internal error occurred. The results of the
2096
 * check are stored in res.
2097
 */
2098
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2099
{
2100
    if (bs->drv->bdrv_check == NULL) {
2101
        return -ENOTSUP;
2102
    }
2103

    
2104
    memset(res, 0, sizeof(*res));
2105
    return bs->drv->bdrv_check(bs, res, fix);
2106
}
2107

    
2108
#define COMMIT_BUF_SECTORS 2048
2109

    
2110
/* commit COW file into the raw image */
2111
int bdrv_commit(BlockDriverState *bs)
2112
{
2113
    BlockDriver *drv = bs->drv;
2114
    int64_t sector, total_sectors, length, backing_length;
2115
    int n, ro, open_flags;
2116
    int ret = 0;
2117
    uint8_t *buf = NULL;
2118
    char filename[PATH_MAX];
2119

    
2120
    if (!drv)
2121
        return -ENOMEDIUM;
2122
    
2123
    if (!bs->backing_hd) {
2124
        return -ENOTSUP;
2125
    }
2126

    
2127
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2128
        return -EBUSY;
2129
    }
2130

    
2131
    ro = bs->backing_hd->read_only;
2132
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2133
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2134
    open_flags =  bs->backing_hd->open_flags;
2135

    
2136
    if (ro) {
2137
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2138
            return -EACCES;
2139
        }
2140
    }
2141

    
2142
    length = bdrv_getlength(bs);
2143
    if (length < 0) {
2144
        ret = length;
2145
        goto ro_cleanup;
2146
    }
2147

    
2148
    backing_length = bdrv_getlength(bs->backing_hd);
2149
    if (backing_length < 0) {
2150
        ret = backing_length;
2151
        goto ro_cleanup;
2152
    }
2153

    
2154
    /* If our top snapshot is larger than the backing file image,
2155
     * grow the backing file image if possible.  If not possible,
2156
     * we must return an error */
2157
    if (length > backing_length) {
2158
        ret = bdrv_truncate(bs->backing_hd, length);
2159
        if (ret < 0) {
2160
            goto ro_cleanup;
2161
        }
2162
    }
2163

    
2164
    total_sectors = length >> BDRV_SECTOR_BITS;
2165
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2166

    
2167
    for (sector = 0; sector < total_sectors; sector += n) {
2168
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2169
        if (ret < 0) {
2170
            goto ro_cleanup;
2171
        }
2172
        if (ret) {
2173
            ret = bdrv_read(bs, sector, buf, n);
2174
            if (ret < 0) {
2175
                goto ro_cleanup;
2176
            }
2177

    
2178
            ret = bdrv_write(bs->backing_hd, sector, buf, n);
2179
            if (ret < 0) {
2180
                goto ro_cleanup;
2181
            }
2182
        }
2183
    }
2184

    
2185
    if (drv->bdrv_make_empty) {
2186
        ret = drv->bdrv_make_empty(bs);
2187
        if (ret < 0) {
2188
            goto ro_cleanup;
2189
        }
2190
        bdrv_flush(bs);
2191
    }
2192

    
2193
    /*
2194
     * Make sure all data we wrote to the backing device is actually
2195
     * stable on disk.
2196
     */
2197
    if (bs->backing_hd) {
2198
        bdrv_flush(bs->backing_hd);
2199
    }
2200

    
2201
    ret = 0;
2202
ro_cleanup:
2203
    g_free(buf);
2204

    
2205
    if (ro) {
2206
        /* ignoring error return here */
2207
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2208
    }
2209

    
2210
    return ret;
2211
}
2212

    
2213
int bdrv_commit_all(void)
2214
{
2215
    BlockDriverState *bs;
2216

    
2217
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2218
        if (bs->drv && bs->backing_hd) {
2219
            int ret = bdrv_commit(bs);
2220
            if (ret < 0) {
2221
                return ret;
2222
            }
2223
        }
2224
    }
2225
    return 0;
2226
}
2227

    
2228
/**
2229
 * Remove an active request from the tracked requests list
2230
 *
2231
 * This function should be called when a tracked request is completing.
2232
 */
2233
static void tracked_request_end(BdrvTrackedRequest *req)
2234
{
2235
    if (req->serialising) {
2236
        req->bs->serialising_in_flight--;
2237
    }
2238

    
2239
    QLIST_REMOVE(req, list);
2240
    qemu_co_queue_restart_all(&req->wait_queue);
2241
}
2242

    
2243
/**
2244
 * Add an active request to the tracked requests list
2245
 */
2246
static void tracked_request_begin(BdrvTrackedRequest *req,
2247
                                  BlockDriverState *bs,
2248
                                  int64_t offset,
2249
                                  unsigned int bytes, bool is_write)
2250
{
2251
    *req = (BdrvTrackedRequest){
2252
        .bs = bs,
2253
        .offset         = offset,
2254
        .bytes          = bytes,
2255
        .is_write       = is_write,
2256
        .co             = qemu_coroutine_self(),
2257
        .serialising    = false,
2258
        .overlap_offset = offset,
2259
        .overlap_bytes  = bytes,
2260
    };
2261

    
2262
    qemu_co_queue_init(&req->wait_queue);
2263

    
2264
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2265
}
2266

    
2267
/*
 * Mark 'req' as serialising and widen its overlap window to 'align'
 * boundaries.
 *
 * The first time a request is marked, its BDS's serialising_in_flight
 * counter is incremented.  The overlap window is only ever extended: the
 * start takes the minimum and the length the maximum of the old and the
 * newly computed value.
 *
 * NOTE(review): the mask arithmetic presumably expects 'align' to be a
 * power of two - confirm against callers.
 */
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t aligned_start = req->offset & ~(align - 1);
    unsigned int aligned_len = ROUND_UP(req->offset + req->bytes, align)
                             - aligned_start;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, aligned_start);
    req->overlap_bytes = MAX(req->overlap_bytes, aligned_len);
}
2281

    
2282
/**
2283
 * Round a region to cluster boundaries
2284
 */
2285
void bdrv_round_to_clusters(BlockDriverState *bs,
2286
                            int64_t sector_num, int nb_sectors,
2287
                            int64_t *cluster_sector_num,
2288
                            int *cluster_nb_sectors)
2289
{
2290
    BlockDriverInfo bdi;
2291

    
2292
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2293
        *cluster_sector_num = sector_num;
2294
        *cluster_nb_sectors = nb_sectors;
2295
    } else {
2296
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2297
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2298
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2299
                                            nb_sectors, c);
2300
    }
2301
}
2302

    
2303
/*
 * Return the cluster size reported by bdrv_get_info() for 'bs', or
 * bs->request_alignment if that call fails or reports a size of 0.
 */
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    }
    return bdi.cluster_size;
}
2315

    
2316
/*
 * Return true if the byte range [offset, offset + bytes) intersects the
 * overlap window of 'req'.
 *
 * The ranges are disjoint exactly when one of them starts at or after the
 * end of the other:
 *
 *        aaaa   bbbb       (range starts after the window ends)
 * bbbb   aaaa              (window starts after the range ends)
 */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    return offset < req->overlap_offset + req->overlap_bytes &&
           req->overlap_offset < offset + bytes;
}
2329

    
2330
/*
 * Wait until no tracked request on self->bs that must be serialised
 * against 'self' overlaps its overlap window any more.
 *
 * A tracked request is relevant if it or 'self' is serialising and the
 * overlap windows intersect.  After every wait the list is scanned again
 * from the start.
 *
 * Returns true if the coroutine had to wait at least once.
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *other;
    bool did_wait = false;
    bool rescan;

    /* fast path: nothing serialising is in flight on this BDS */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        rescan = false;
        QLIST_FOREACH(other, &bs->tracked_requests, list) {
            if (other == self) {
                continue;
            }
            if (!other->serialising && !self->serialising) {
                continue;
            }
            if (!tracked_request_overlaps(other, self->overlap_offset,
                                          self->overlap_bytes)) {
                continue;
            }

            /* Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests.  This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != other->co);

            /* If the request is already (indirectly) waiting for us, or
             * will wait for us as soon as it wakes up, then just go on
             * (instead of producing a deadlock in the former case). */
            if (other->waiting_for) {
                continue;
            }

            self->waiting_for = other;
            qemu_co_queue_wait(&other->wait_queue);
            self->waiting_for = NULL;

            /* the list may have changed while we slept: start over */
            rescan = true;
            did_wait = true;
            break;
        }
    } while (rescan);

    return did_wait;
}
2373

    
2374
/*
2375
 * Return values:
2376
 * 0        - success
2377
 * -EINVAL  - backing format specified, but no file
2378
 * -ENOSPC  - can't update the backing file because no space is left in the
2379
 *            image file header
2380
 * -ENOTSUP - format driver doesn't support changing the backing file
2381
 */
2382
int bdrv_change_backing_file(BlockDriverState *bs,
2383
    const char *backing_file, const char *backing_fmt)
2384
{
2385
    BlockDriver *drv = bs->drv;
2386
    int ret;
2387

    
2388
    /* Backing file format doesn't make sense without a backing file */
2389
    if (backing_fmt && !backing_file) {
2390
        return -EINVAL;
2391
    }
2392

    
2393
    if (drv->bdrv_change_backing_file != NULL) {
2394
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2395
    } else {
2396
        ret = -ENOTSUP;
2397
    }
2398

    
2399
    if (ret == 0) {
2400
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2401
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2402
    }
2403
    return ret;
2404
}
2405

    
2406
/*
2407
 * Finds the image layer in the chain that has 'bs' as its backing file.
2408
 *
2409
 * active is the current topmost image.
2410
 *
2411
 * Returns NULL if bs is not found in active's image chain,
2412
 * or if active == bs.
2413
 */
2414
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2415
                                    BlockDriverState *bs)
2416
{
2417
    BlockDriverState *overlay = NULL;
2418
    BlockDriverState *intermediate;
2419

    
2420
    assert(active != NULL);
2421
    assert(bs != NULL);
2422

    
2423
    /* if bs is the same as active, then by definition it has no overlay
2424
     */
2425
    if (active == bs) {
2426
        return NULL;
2427
    }
2428

    
2429
    intermediate = active;
2430
    while (intermediate->backing_hd) {
2431
        if (intermediate->backing_hd == bs) {
2432
            overlay = intermediate;
2433
            break;
2434
        }
2435
        intermediate = intermediate->backing_hd;
2436
    }
2437

    
2438
    return overlay;
2439
}
2440

    
2441
typedef struct BlkIntermediateStates {
2442
    BlockDriverState *bs;
2443
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2444
} BlkIntermediateStates;
2445

    
2446

    
2447
/*
2448
 * Drops images above 'base' up to and including 'top', and sets the image
2449
 * above 'top' to have base as its backing file.
2450
 *
2451
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2452
 * information in 'bs' can be properly updated.
2453
 *
2454
 * E.g., this will convert the following chain:
2455
 * bottom <- base <- intermediate <- top <- active
2456
 *
2457
 * to
2458
 *
2459
 * bottom <- base <- active
2460
 *
2461
 * It is allowed for bottom==base, in which case it converts:
2462
 *
2463
 * base <- intermediate <- top <- active
2464
 *
2465
 * to
2466
 *
2467
 * base <- active
2468
 *
2469
 * Error conditions:
2470
 *  if active == top, that is considered an error
2471
 *
2472
 */
2473
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2474
                           BlockDriverState *base)
2475
{
2476
    BlockDriverState *intermediate;
2477
    BlockDriverState *base_bs = NULL;
2478
    BlockDriverState *new_top_bs = NULL;
2479
    BlkIntermediateStates *intermediate_state, *next;
2480
    int ret = -EIO;
2481

    
2482
    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2483
    QSIMPLEQ_INIT(&states_to_delete);
2484

    
2485
    if (!top->drv || !base->drv) {
2486
        goto exit;
2487
    }
2488

    
2489
    new_top_bs = bdrv_find_overlay(active, top);
2490

    
2491
    if (new_top_bs == NULL) {
2492
        /* we could not find the image above 'top', this is an error */
2493
        goto exit;
2494
    }
2495

    
2496
    /* special case of new_top_bs->backing_hd already pointing to base - nothing
2497
     * to do, no intermediate images */
2498
    if (new_top_bs->backing_hd == base) {
2499
        ret = 0;
2500
        goto exit;
2501
    }
2502

    
2503
    intermediate = top;
2504

    
2505
    /* now we will go down through the list, and add each BDS we find
2506
     * into our deletion queue, until we hit the 'base'
2507
     */
2508
    while (intermediate) {
2509
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2510
        intermediate_state->bs = intermediate;
2511
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2512

    
2513
        if (intermediate->backing_hd == base) {
2514
            base_bs = intermediate->backing_hd;
2515
            break;
2516
        }
2517
        intermediate = intermediate->backing_hd;
2518
    }
2519
    if (base_bs == NULL) {
2520
        /* something went wrong, we did not end at the base. safely
2521
         * unravel everything, and exit with error */
2522
        goto exit;
2523
    }
2524

    
2525
    /* success - we can delete the intermediate states, and link top->base */
2526
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2527
                                   base_bs->drv ? base_bs->drv->format_name : "");
2528
    if (ret) {
2529
        goto exit;
2530
    }
2531
    new_top_bs->backing_hd = base_bs;
2532

    
2533
    bdrv_refresh_limits(new_top_bs);
2534

    
2535
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2536
        /* so that bdrv_close() does not recursively close the chain */
2537
        intermediate_state->bs->backing_hd = NULL;
2538
        bdrv_unref(intermediate_state->bs);
2539
    }
2540
    ret = 0;
2541

    
2542
exit:
2543
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2544
        g_free(intermediate_state);
2545
    }
2546
    return ret;
2547
}
2548

    
2549

    
2550
/*
 * Validate a byte-granularity request of 'size' bytes at 'offset'.
 *
 * Returns -ENOMEDIUM if no medium is inserted, 0 without further checks if
 * the BDS is growable, -EIO if the offset is negative or the range does
 * not lie within the current length of the device, and 0 otherwise.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t total;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    total = bdrv_getlength(bs);

    /* comparing the remainder avoids computing offset + size */
    if (offset < 0 || offset > total || total - offset < size) {
        return -EIO;
    }

    return 0;
}
2571

    
2572
/*
 * Sector-granularity front end for bdrv_check_byte_request(): both values
 * are scaled by BDRV_SECTOR_SIZE before the byte-based check runs.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    int64_t offset = sector_num * BDRV_SECTOR_SIZE;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;

    return bdrv_check_byte_request(bs, offset, size);
}
2578

    
2579
typedef struct RwCo {
2580
    BlockDriverState *bs;
2581
    int64_t offset;
2582
    QEMUIOVector *qiov;
2583
    bool is_write;
2584
    int ret;
2585
    BdrvRequestFlags flags;
2586
} RwCo;
2587

    
2588
/*
 * Coroutine entry point for synchronous requests: performs the read or
 * write described by the RwCo passed as 'opaque' and stores the result in
 * its 'ret' member.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (rwco->is_write) {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    }
}
2602

    
2603
/*
2604
 * Process a vectored synchronous request using coroutines
2605
 */
2606
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2607
                        QEMUIOVector *qiov, bool is_write,
2608
                        BdrvRequestFlags flags)
2609
{
2610
    Coroutine *co;
2611
    RwCo rwco = {
2612
        .bs = bs,
2613
        .offset = offset,
2614
        .qiov = qiov,
2615
        .is_write = is_write,
2616
        .ret = NOT_DONE,
2617
        .flags = flags,
2618
    };
2619

    
2620
    /**
2621
     * In sync call context, when the vcpu is blocked, this throttling timer
2622
     * will not fire; so the I/O throttling function has to be disabled here
2623
     * if it has been enabled.
2624
     */
2625
    if (bs->io_limits_enabled) {
2626
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
2627
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
2628
        bdrv_io_limits_disable(bs);
2629
    }
2630

    
2631
    if (qemu_in_coroutine()) {
2632
        /* Fast-path if already in coroutine context */
2633
        bdrv_rw_co_entry(&rwco);
2634
    } else {
2635
        co = qemu_coroutine_create(bdrv_rw_co_entry);
2636
        qemu_coroutine_enter(co, &rwco);
2637
        while (rwco.ret == NOT_DONE) {
2638
            qemu_aio_wait();
2639
        }
2640
    }
2641
    return rwco.ret;
2642
}
2643

    
2644
/*
2645
 * Process a synchronous request using coroutines
2646
 */
2647
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2648
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
2649
{
2650
    QEMUIOVector qiov;
2651
    struct iovec iov = {
2652
        .iov_base = (void *)buf,
2653
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2654
    };
2655

    
2656
    qemu_iovec_init_external(&qiov, &iov, 1);
2657
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2658
                        &qiov, is_write, flags);
2659
}
2660

    
2661
/* return < 0 if error. See bdrv_write() for the return codes */
2662
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2663
              uint8_t *buf, int nb_sectors)
2664
{
2665
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2666
}
2667

    
2668
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2669
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2670
                          uint8_t *buf, int nb_sectors)
2671
{
2672
    bool enabled;
2673
    int ret;
2674

    
2675
    enabled = bs->io_limits_enabled;
2676
    bs->io_limits_enabled = false;
2677
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2678
    bs->io_limits_enabled = enabled;
2679
    return ret;
2680
}
2681

    
2682
/* Return < 0 if error. Important errors are:
2683
  -EIO         generic I/O error (may happen for all errors)
2684
  -ENOMEDIUM   No media inserted.
2685
  -EINVAL      Invalid sector number or nb_sectors
2686
  -EACCES      Trying to write a read-only device
2687
*/
2688
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2689
               const uint8_t *buf, int nb_sectors)
2690
{
2691
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2692
}
2693

    
2694
/*
 * Synchronously issue a zero-write of nb_sectors sectors at sector_num.
 *
 * No data buffer is passed; the request is a write with
 * BDRV_REQ_ZERO_WRITE added to the caller's flags.  Returns the result of
 * bdrv_rw_co().
 */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    BdrvRequestFlags zero_flags = BDRV_REQ_ZERO_WRITE | flags;

    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, zero_flags);
}
2700

    
2701
/*
2702
 * Completely zero out a block device with the help of bdrv_write_zeroes.
2703
 * The operation is sped up by checking the block status and only writing
2704
 * zeroes to the device if they currently do not return zeroes. Optional
2705
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2706
 *
2707
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2708
 */
2709
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2710
{
2711
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2712
    int64_t ret, nb_sectors, sector_num = 0;
2713
    int n;
2714

    
2715
    for (;;) {
2716
        nb_sectors = target_size - sector_num;
2717
        if (nb_sectors <= 0) {
2718
            return 0;
2719
        }
2720
        if (nb_sectors > INT_MAX) {
2721
            nb_sectors = INT_MAX;
2722
        }
2723
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2724
        if (ret < 0) {
2725
            error_report("error getting block status at sector %" PRId64 ": %s",
2726
                         sector_num, strerror(-ret));
2727
            return ret;
2728
        }
2729
        if (ret & BDRV_BLOCK_ZERO) {
2730
            sector_num += n;
2731
            continue;
2732
        }
2733
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2734
        if (ret < 0) {
2735
            error_report("error writing zeroes at sector %" PRId64 ": %s",
2736
                         sector_num, strerror(-ret));
2737
            return ret;
2738
        }
2739
        sector_num += n;
2740
    }
2741
}
2742

    
2743
/*
 * Synchronously read 'bytes' bytes at byte offset 'offset' into 'buf'.
 *
 * Returns 'bytes' on success, -EINVAL if 'bytes' is negative, or the
 * negative result of the underlying request.
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    iov.iov_base = (void *)buf;
    iov.iov_len = bytes;
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);

    return ret < 0 ? ret : bytes;
}
2764

    
2765
/*
 * Synchronously write the I/O vector 'qiov' at byte offset 'offset'.
 *
 * Returns the size of the vector on success, or the negative result of
 * the underlying request.
 */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret = bdrv_prwv_co(bs, offset, qiov, true, 0);

    if (ret >= 0) {
        return qiov->size;
    }
    return ret;
}
2776

    
2777
/*
 * Synchronously write 'bytes' bytes from 'buf' at byte offset 'offset'.
 *
 * Returns -EINVAL if 'bytes' is negative, otherwise the result of
 * bdrv_pwritev() on a single-element vector covering the buffer.
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov;

    if (bytes < 0) {
        return -EINVAL;
    }

    iov.iov_base = (void *) buf;
    iov.iov_len = bytes;
    qemu_iovec_init_external(&qiov, &iov, 1);

    return bdrv_pwritev(bs, offset, &qiov);
}
2793

    
2794
/*
2795
 * Writes to the file and ensures that no writes are reordered across this
2796
 * request (acts as a barrier)
2797
 *
2798
 * Returns 0 on success, -errno in error cases.
2799
 */
2800
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2801
    const void *buf, int count)
2802
{
2803
    int ret;
2804

    
2805
    ret = bdrv_pwrite(bs, offset, buf, count);
2806
    if (ret < 0) {
2807
        return ret;
2808
    }
2809

    
2810
    /* No flush needed for cache modes that already do it */
2811
    if (bs->enable_write_cache) {
2812
        bdrv_flush(bs);
2813
    }
2814

    
2815
    return 0;
2816
}
2817

    
2818
/*
 * Copy-on-read helper: reads the whole cluster range covering the request
 * into a bounce buffer, writes that range back through the driver (using
 * the write-zeroes path when the driver offers one and the data is all
 * zero), and finally copies the requested part into 'qiov'.
 *
 * Returns the negative driver error on failure, otherwise the result of
 * the write step.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector cluster_qiov;
    struct iovec cluster_iov;
    int64_t first_sector;
    int sector_count;
    size_t head_skip;
    int ret;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &first_sector, &sector_count);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   first_sector, sector_count);

    cluster_iov.iov_len = sector_count * BDRV_SECTOR_SIZE;
    bounce = qemu_blockalign(bs, cluster_iov.iov_len);
    cluster_iov.iov_base = bounce;
    qemu_iovec_init_external(&cluster_qiov, &cluster_iov, 1);

    ret = drv->bdrv_co_readv(bs, first_sector, sector_count, &cluster_qiov);
    if (ret < 0) {
        goto out_free;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce, cluster_iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, first_sector, sector_count, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, first_sector, sector_count,
                                  &cluster_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto out_free;
    }

    /* Hand back only the part of the cluster range the caller asked for. */
    head_skip = (sector_num - first_sector) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce + head_skip,
                        nb_sectors * BDRV_SECTOR_SIZE);

out_free:
    qemu_vfree(bounce);
    return ret;
}
2883

    
2884
/*
2885
 * Forwards an already correctly aligned request to the BlockDriver. This
2886
 * handles copy on read and zeroing after EOF; any other features must be
2887
 * implemented by the caller.
2888
 */
2889
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2890
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2891
    int64_t align, QEMUIOVector *qiov, int flags)
2892
{
2893
    BlockDriver *drv = bs->drv;
2894
    int ret;
2895

    
2896
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2897
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2898

    
2899
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2900
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2901

    
2902
    /* Handle Copy on Read and associated serialisation */
2903
    if (flags & BDRV_REQ_COPY_ON_READ) {
2904
        /* If we touch the same cluster it counts as an overlap.  This
2905
         * guarantees that allocating writes will be serialized and not race
2906
         * with each other for the same cluster.  For example, in copy-on-read
2907
         * it ensures that the CoR read and write operations are atomic and
2908
         * guest writes cannot interleave between them. */
2909
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
2910
    }
2911

    
2912
    wait_serialising_requests(req);
2913

    
2914
    if (flags & BDRV_REQ_COPY_ON_READ) {
2915
        int pnum;
2916

    
2917
        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2918
        if (ret < 0) {
2919
            goto out;
2920
        }
2921

    
2922
        if (!ret || pnum != nb_sectors) {
2923
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2924
            goto out;
2925
        }
2926
    }
2927

    
2928
    /* Forward the request to the BlockDriver */
2929
    if (!(bs->zero_beyond_eof && bs->growable)) {
2930
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2931
    } else {
2932
        /* Read zeros after EOF of growable BDSes */
2933
        int64_t len, total_sectors, max_nb_sectors;
2934

    
2935
        len = bdrv_getlength(bs);
2936
        if (len < 0) {
2937
            ret = len;
2938
            goto out;
2939
        }
2940

    
2941
        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2942
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2943
                                  align >> BDRV_SECTOR_BITS);
2944
        if (max_nb_sectors > 0) {
2945
            ret = drv->bdrv_co_readv(bs, sector_num,
2946
                                     MIN(nb_sectors, max_nb_sectors), qiov);
2947
        } else {
2948
            ret = 0;
2949
        }
2950

    
2951
        /* Reading beyond end of file is supposed to produce zeroes */
2952
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2953
            uint64_t offset = MAX(0, total_sectors - sector_num);
2954
            uint64_t bytes = (sector_num + nb_sectors - offset) *
2955
                              BDRV_SECTOR_SIZE;
2956
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2957
        }
2958
    }
2959

    
2960
out:
2961
    return ret;
2962
}
2963

    
2964
/*
2965
 * Handle a read request in coroutine context
2966
 */
2967
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2968
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2969
    BdrvRequestFlags flags)
2970
{
2971
    BlockDriver *drv = bs->drv;
2972
    BdrvTrackedRequest req;
2973

    
2974
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2975
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2976
    uint8_t *head_buf = NULL;
2977
    uint8_t *tail_buf = NULL;
2978
    QEMUIOVector local_qiov;
2979
    bool use_local_qiov = false;
2980
    int ret;
2981

    
2982
    if (!drv) {
2983
        return -ENOMEDIUM;
2984
    }
2985
    if (bdrv_check_byte_request(bs, offset, bytes)) {
2986
        return -EIO;
2987
    }
2988

    
2989
    if (bs->copy_on_read) {
2990
        flags |= BDRV_REQ_COPY_ON_READ;
2991
    }
2992

    
2993
    /* throttling disk I/O */
2994
    if (bs->io_limits_enabled) {
2995
        bdrv_io_limits_intercept(bs, bytes, false);
2996
    }
2997

    
2998
    /* Align read if necessary by padding qiov */
2999
    if (offset & (align - 1)) {
3000
        head_buf = qemu_blockalign(bs, align);
3001
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
3002
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3003
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3004
        use_local_qiov = true;
3005

    
3006
        bytes += offset & (align - 1);
3007
        offset = offset & ~(align - 1);
3008
    }
3009

    
3010
    if ((offset + bytes) & (align - 1)) {
3011
        if (!use_local_qiov) {
3012
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
3013
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3014
            use_local_qiov = true;
3015
        }
3016
        tail_buf = qemu_blockalign(bs, align);
3017
        qemu_iovec_add(&local_qiov, tail_buf,
3018
                       align - ((offset + bytes) & (align - 1)));
3019

    
3020
        bytes = ROUND_UP(bytes, align);
3021
    }
3022

    
3023
    tracked_request_begin(&req, bs, offset, bytes, false);
3024
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3025
                              use_local_qiov ? &local_qiov : qiov,
3026
                              flags);
3027
    tracked_request_end(&req);
3028

    
3029
    if (use_local_qiov) {
3030
        qemu_iovec_destroy(&local_qiov);
3031
        qemu_vfree(head_buf);
3032
        qemu_vfree(tail_buf);
3033
    }
3034

    
3035
    return ret;
3036
}
3037

    
3038
/*
 * Sector-based front end for bdrv_co_do_preadv(): converts 'sector_num' and
 * 'nb_sectors' to a byte offset and byte count.
 *
 * Returns -EINVAL if 'nb_sectors' is negative or its byte count would not
 * fit in an unsigned int, otherwise the result of bdrv_co_do_preadv().
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    /* The bound above admits sector counts whose byte size exceeds INT_MAX,
     * so the shift must be done on an unsigned value: shifting the signed
     * int past INT_MAX is undefined behaviour.
     */
    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             (unsigned int)nb_sectors << BDRV_SECTOR_BITS,
                             qiov, flags);
}
3049

    
3050
/*
 * Coroutine read of 'nb_sectors' sectors starting at 'sector_num' into
 * 'qiov'.  Emits a trace event and forwards to bdrv_co_do_readv() with no
 * request flags.
 */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
3057

    
3058
/*
 * Same as bdrv_co_readv(), but the request is issued with
 * BDRV_REQ_COPY_ON_READ set.  Emits its own trace event.
 */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
3066

    
3067
/* if no limit is specified in the BlockLimits use a default
3068
 * of 32768 512-byte sectors (16 MiB) per request.
3069
 */
3070
#define MAX_WRITE_ZEROES_DEFAULT 32768
3071

    
3072
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3073
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3074
{
3075
    BlockDriver *drv = bs->drv;
3076
    QEMUIOVector qiov;
3077
    struct iovec iov = {0};
3078
    int ret = 0;
3079

    
3080
    int max_write_zeroes = bs->bl.max_write_zeroes ?
3081
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3082

    
3083
    while (nb_sectors > 0 && !ret) {
3084
        int num = nb_sectors;
3085

    
3086
        /* Align request.  Block drivers can expect the "bulk" of the request
3087
         * to be aligned.
3088
         */
3089
        if (bs->bl.write_zeroes_alignment
3090
            && num > bs->bl.write_zeroes_alignment) {
3091
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3092
                /* Make a small request up to the first aligned sector.  */
3093
                num = bs->bl.write_zeroes_alignment;
3094
                num -= sector_num % bs->bl.write_zeroes_alignment;
3095
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3096
                /* Shorten the request to the last aligned sector.  num cannot
3097
                 * underflow because num > bs->bl.write_zeroes_alignment.
3098
                 */
3099
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3100
            }
3101
        }
3102

    
3103
        /* limit request size */
3104
        if (num > max_write_zeroes) {
3105
            num = max_write_zeroes;
3106
        }
3107

    
3108
        ret = -ENOTSUP;
3109
        /* First try the efficient write zeroes operation */
3110
        if (drv->bdrv_co_write_zeroes) {
3111
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3112
        }
3113

    
3114
        if (ret == -ENOTSUP) {
3115
            /* Fall back to bounce buffer if write zeroes is unsupported */
3116
            iov.iov_len = num * BDRV_SECTOR_SIZE;
3117
            if (iov.iov_base == NULL) {
3118
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3119
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3120
            }
3121
            qemu_iovec_init_external(&qiov, &iov, 1);
3122

    
3123
            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3124

    
3125
            /* Keep bounce buffer around if it is big enough for all
3126
             * all future requests.
3127
             */
3128
            if (num < max_write_zeroes) {
3129
                qemu_vfree(iov.iov_base);
3130
                iov.iov_base = NULL;
3131
            }
3132
        }
3133

    
3134
        sector_num += num;
3135
        nb_sectors -= num;
3136
    }
3137

    
3138
    qemu_vfree(iov.iov_base);
3139
    return ret;
3140
}
3141

    
3142
/*
3143
 * Forwards an already correctly aligned write request to the BlockDriver.
3144
 */
3145
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3146
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3147
    QEMUIOVector *qiov, int flags)
3148
{
3149
    BlockDriver *drv = bs->drv;
3150
    bool waited;
3151
    int ret;
3152

    
3153
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3154
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3155

    
3156
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3157
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3158

    
3159
    waited = wait_serialising_requests(req);
3160
    assert(!waited || !req->serialising);
3161
    assert(req->overlap_offset <= offset);
3162
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3163

    
3164
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3165

    
3166
    if (ret < 0) {
3167
        /* Do nothing, write notifier decided to fail this request */
3168
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
3169
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3170
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3171
    } else {
3172
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3173
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3174
    }
3175
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3176

    
3177
    if (ret == 0 && !bs->enable_write_cache) {
3178
        ret = bdrv_co_flush(bs);
3179
    }
3180

    
3181
    bdrv_set_dirty(bs, sector_num, nb_sectors);
3182

    
3183
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3184
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
3185
    }
3186
    if (bs->growable && ret >= 0) {
3187
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3188
    }
3189

    
3190
    return ret;
3191
}
3192

    
3193
/*
3194
 * Handle a write request in coroutine context
3195
 */
3196
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3197
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3198
    BdrvRequestFlags flags)
3199
{
3200
    BdrvTrackedRequest req;
3201
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3202
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3203
    uint8_t *head_buf = NULL;
3204
    uint8_t *tail_buf = NULL;
3205
    QEMUIOVector local_qiov;
3206
    bool use_local_qiov = false;
3207
    int ret;
3208

    
3209
    if (!bs->drv) {
3210
        return -ENOMEDIUM;
3211
    }
3212
    if (bs->read_only) {
3213
        return -EACCES;
3214
    }
3215
    if (bdrv_check_byte_request(bs, offset, bytes)) {
3216
        return -EIO;
3217
    }
3218

    
3219
    /* throttling disk I/O */
3220
    if (bs->io_limits_enabled) {
3221
        bdrv_io_limits_intercept(bs, bytes, true);
3222
    }
3223

    
3224
    /*
3225
     * Align write if necessary by performing a read-modify-write cycle.
3226
     * Pad qiov with the read parts and be sure to have a tracked request not
3227
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3228
     */
3229
    tracked_request_begin(&req, bs, offset, bytes, true);
3230

    
3231
    if (offset & (align - 1)) {
3232
        QEMUIOVector head_qiov;
3233
        struct iovec head_iov;
3234

    
3235
        mark_request_serialising(&req, align);
3236
        wait_serialising_requests(&req);
3237

    
3238
        head_buf = qemu_blockalign(bs, align);
3239
        head_iov = (struct iovec) {
3240
            .iov_base   = head_buf,
3241
            .iov_len    = align,
3242
        };
3243
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3244

    
3245
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3246
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3247
                                  align, &head_qiov, 0);
3248
        if (ret < 0) {
3249
            goto fail;
3250
        }
3251
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3252

    
3253
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
3254
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3255
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3256
        use_local_qiov = true;
3257

    
3258
        bytes += offset & (align - 1);
3259
        offset = offset & ~(align - 1);
3260
    }
3261

    
3262
    if ((offset + bytes) & (align - 1)) {
3263
        QEMUIOVector tail_qiov;
3264
        struct iovec tail_iov;
3265
        size_t tail_bytes;
3266
        bool waited;
3267

    
3268
        mark_request_serialising(&req, align);
3269
        waited = wait_serialising_requests(&req);
3270
        assert(!waited || !use_local_qiov);
3271

    
3272
        tail_buf = qemu_blockalign(bs, align);
3273
        tail_iov = (struct iovec) {
3274
            .iov_base   = tail_buf,
3275
            .iov_len    = align,
3276
        };
3277
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3278

    
3279
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3280
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3281
                                  align, &tail_qiov, 0);
3282
        if (ret < 0) {
3283
            goto fail;
3284
        }
3285
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3286

    
3287
        if (!use_local_qiov) {
3288
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
3289
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3290
            use_local_qiov = true;
3291
        }
3292

    
3293
        tail_bytes = (offset + bytes) & (align - 1);
3294
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3295

    
3296
        bytes = ROUND_UP(bytes, align);
3297
    }
3298

    
3299
    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3300
                               use_local_qiov ? &local_qiov : qiov,
3301
                               flags);
3302

    
3303
fail:
3304
    tracked_request_end(&req);
3305

    
3306
    if (use_local_qiov) {
3307
        qemu_iovec_destroy(&local_qiov);
3308
    }
3309
    qemu_vfree(head_buf);
3310
    qemu_vfree(tail_buf);
3311

    
3312
    return ret;
3313
}
3314

    
3315
/*
 * Sector-based front end for bdrv_co_do_pwritev(): converts 'sector_num'
 * and 'nb_sectors' to a byte offset and byte count.
 *
 * Returns -EINVAL if 'nb_sectors' is negative or its byte count would not
 * fit in an int, otherwise the result of bdrv_co_do_pwritev().
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    const int max_sectors = INT_MAX >> BDRV_SECTOR_BITS;

    if (nb_sectors < 0 || nb_sectors > max_sectors) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}
3326

    
3327
/*
 * Coroutine write of 'nb_sectors' sectors starting at 'sector_num' from
 * 'qiov'.  Emits a trace event and forwards to bdrv_co_do_writev() with no
 * request flags.
 */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
3334

    
3335
/*
 * Coroutine zero write of 'nb_sectors' sectors starting at 'sector_num'.
 *
 * BDRV_REQ_MAY_UNMAP is dropped from 'flags' unless the device was opened
 * with BDRV_O_UNMAP.  The request is issued through bdrv_co_do_writev()
 * with BDRV_REQ_ZERO_WRITE set and no I/O vector.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    bool unmap_allowed;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    unmap_allowed = bs->open_flags & BDRV_O_UNMAP;
    if (!unmap_allowed) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}
3348

    
3349
/**
3350
 * Truncate file to 'offset' bytes (needed only for file protocols)
3351
 */
3352
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3353
{
3354
    BlockDriver *drv = bs->drv;
3355
    int ret;
3356
    if (!drv)
3357
        return -ENOMEDIUM;
3358
    if (!drv->bdrv_truncate)
3359
        return -ENOTSUP;
3360
    if (bs->read_only)
3361
        return -EACCES;
3362
    if (bdrv_in_use(bs))
3363
        return -EBUSY;
3364
    ret = drv->bdrv_truncate(bs, offset);
3365
    if (ret == 0) {
3366
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3367
        bdrv_dev_resize_cb(bs);
3368
    }
3369
    return ret;
3370
}
3371

    
3372
/**
3373
 * Length of a allocated file in bytes. Sparse files are counted by actual
3374
 * allocated space. Return < 0 if error or unknown.
3375
 */
3376
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3377
{
3378
    BlockDriver *drv = bs->drv;
3379
    if (!drv) {
3380
        return -ENOMEDIUM;
3381
    }
3382
    if (drv->bdrv_get_allocated_file_size) {
3383
        return drv->bdrv_get_allocated_file_size(bs);
3384
    }
3385
    if (bs->file) {
3386
        return bdrv_get_allocated_file_size(bs->file);
3387
    }
3388
    return -ENOTSUP;
3389
}
3390

    
3391
/**
3392
 * Length of a file in bytes. Return < 0 if error or unknown.
3393
 */
3394
int64_t bdrv_getlength(BlockDriverState *bs)
3395
{
3396
    BlockDriver *drv = bs->drv;
3397
    if (!drv)
3398
        return -ENOMEDIUM;
3399

    
3400
    if (drv->has_variable_length) {
3401
        int ret = refresh_total_sectors(bs, bs->total_sectors);
3402
        if (ret < 0) {
3403
            return ret;
3404
        }
3405
    }
3406
    return bs->total_sectors * BDRV_SECTOR_SIZE;
3407
}
3408

    
3409
/* return 0 as number of sectors if no device present or error */
3410
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3411
{
3412
    int64_t length;
3413
    length = bdrv_getlength(bs);
3414
    if (length < 0)
3415
        length = 0;
3416
    else
3417
        length = length >> BDRV_SECTOR_BITS;
3418
    *nb_sectors_ptr = length;
3419
}
3420

    
3421
/* Record the error policies to apply to failed reads and failed writes. */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
3427

    
3428
/* Return the configured error policy for reads ('is_read' true) or for
 * writes ('is_read' false). */
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
3432

    
3433
/*
 * Map the configured read or write error policy and the positive errno
 * value 'error' to the action to take.  The ENOSPC policy stops only when
 * 'error' is ENOSPC and reports otherwise.  An unknown policy aborts.
 */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read,
                                       int error)
{
    BlockdevOnError policy;

    if (is_read) {
        policy = bs->on_read_error;
    } else {
        policy = bs->on_write_error;
    }

    switch (policy) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        if (error == ENOSPC) {
            return BDRV_ACTION_STOP;
        }
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
3450

    
3451
/* This is done by device models because, while the block layer knows
3452
 * about the error, it does not know whether an operation comes from
3453
 * the device or the block layer (from a job, for example).
3454
 */
3455
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3456
                       bool is_read, int error)
3457
{
3458
    assert(error >= 0);
3459
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3460
    if (action == BDRV_ACTION_STOP) {
3461
        vm_stop(RUN_STATE_IO_ERROR);
3462
        bdrv_iostatus_set_err(bs, error);
3463
    }
3464
}
3465

    
3466
/* Return the read_only field of 'bs'. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
3470

    
3471
/* Return the sg field of 'bs'. */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
3475

    
3476
/* Return the enable_write_cache field of 'bs'. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
3480

    
3481
/* Set the write-cache-enable state of 'bs' and mirror it in the
 * BDRV_O_CACHE_WB open flag. */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (!wce) {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
        return;
    }
    bs->open_flags |= BDRV_O_CACHE_WB;
}
3492

    
3493
/* Return 1 if the backing file of 'bs' is encrypted, otherwise the
 * encrypted field of 'bs' itself. */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
3499

    
3500
/* Return non-zero if 'bs' or its backing file is encrypted and does not yet
 * have a valid key. */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;
    int backing_needs_key = backing && backing->encrypted &&
                            !backing->valid_key;

    if (backing_needs_key) {
        return 1;
    }
    return (bs->encrypted && !bs->valid_key);
}
3508

    
3509
/*
 * Set the encryption key of 'bs', first applying it to an encrypted backing
 * file if there is one.
 *
 * Returns the backing file's error if setting its key fails, 0 if only the
 * backing file is encrypted, -EINVAL if 'bs' is not encrypted, -ENOMEDIUM
 * if there is no driver or no set-key callback, otherwise the driver's
 * result.  valid_key is cleared on driver failure.  On the first success
 * valid_key is set and the media change callback is run.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    BlockDriverState *backing = bs->backing_hd;
    int ret;

    if (backing && backing->encrypted) {
        ret = bdrv_set_key(backing, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
3534

    
3535
/* Return the driver's format name, or NULL if 'bs' has no driver. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }
    return bs->drv->format_name;
}
3539

    
3540
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3541
                         void *opaque)
3542
{
3543
    BlockDriver *drv;
3544

    
3545
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
3546
        it(opaque, drv->format_name);
3547
    }
3548
}
3549

    
3550
/* This function is to find block backend bs */
3551
BlockDriverState *bdrv_find(const char *name)
3552
{
3553
    BlockDriverState *bs;
3554

    
3555
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3556
        if (!strcmp(name, bs->device_name)) {
3557
            return bs;
3558
        }
3559
    }
3560
    return NULL;
3561
}
3562

    
3563
/* This function is to find a node in the bs graph */
3564
BlockDriverState *bdrv_find_node(const char *node_name)
3565
{
3566
    BlockDriverState *bs;
3567

    
3568
    assert(node_name);
3569

    
3570
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3571
        if (!strcmp(node_name, bs->node_name)) {
3572
            return bs;
3573
        }
3574
    }
3575
    return NULL;
3576
}
3577

    
3578
/* Put this QMP function here so it can access the static graph_bdrv_states. */
3579
BlockDeviceInfoList *bdrv_named_nodes_list(void)
3580
{
3581
    BlockDeviceInfoList *list, *entry;
3582
    BlockDriverState *bs;
3583

    
3584
    list = NULL;
3585
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3586
        entry = g_malloc0(sizeof(*entry));
3587
        entry->value = bdrv_block_device_info(bs);
3588
        entry->next = list;
3589
        list = entry;
3590
    }
3591

    
3592
    return list;
3593
}
3594

    
3595
BlockDriverState *bdrv_lookup_bs(const char *device,
3596
                                 const char *node_name,
3597
                                 Error **errp)
3598
{
3599
    BlockDriverState *bs = NULL;
3600

    
3601
    if (device) {
3602
        bs = bdrv_find(device);
3603

    
3604
        if (bs) {
3605
            return bs;
3606
        }
3607
    }
3608

    
3609
    if (node_name) {
3610
        bs = bdrv_find_node(node_name);
3611

    
3612
        if (bs) {
3613
            return bs;
3614
        }
3615
    }
3616

    
3617
    error_setg(errp, "Cannot find device=%s nor node_name=%s",
3618
                     device ? device : "",
3619
                     node_name ? node_name : "");
3620
    return NULL;
3621
}
3622

    
3623
/* Iterate over bdrv_states: a NULL argument yields the first entry, any
 * other argument yields the entry following it on the device list. */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (bs) {
        return QTAILQ_NEXT(bs, device_list);
    }
    return QTAILQ_FIRST(&bdrv_states);
}
3630

    
3631
/* Call 'it' with 'opaque' for every entry of bdrv_states. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, device_list) {
        it(opaque, cur);
    }
}
3639

    
3640
/* Return the device_name field of 'bs'. */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
3644

    
3645
/* Return the open_flags field of 'bs'. */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
3649

    
3650
int bdrv_flush_all(void)
3651
{
3652
    BlockDriverState *bs;
3653
    int result = 0;
3654

    
3655
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3656
        int ret = bdrv_flush(bs);
3657
        if (ret < 0 && !result) {
3658
            result = ret;
3659
        }
3660
    }
3661

    
3662
    return result;
3663
}
3664

    
3665
/* Always returns 1; 'bs' is not inspected. */
int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    const int has_zero_init = 1;

    return has_zero_init;
}
3669

    
3670
/*
 * Report whether a new image reads as zeroes.  'bs' must have a driver.
 * Returns 0 when a backing file is attached or the driver has no
 * bdrv_has_zero_init callback, otherwise the callback's result.
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    BlockDriver *drv;

    assert(bs->drv);
    drv = bs->drv;

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }

    if (!drv->bdrv_has_zero_init) {
        /* safe default */
        return 0;
    }
    return drv->bdrv_has_zero_init(bs);
}
3686

    
3687
/*
 * Return the unallocated_blocks_are_zero flag from bdrv_get_info().
 * Returns false when 'bs' has a backing file or the info query fails.
 */
bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo info;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &info) != 0) {
        return false;
    }
    return info.unallocated_blocks_are_zero;
}
3701

    
3702
/*
 * Return the can_write_zeroes_with_unmap flag from bdrv_get_info().
 * Returns false when 'bs' has a backing file, was not opened with
 * BDRV_O_UNMAP, or the info query fails.
 */
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo info;

    if (bs->backing_hd) {
        return false;
    }
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &info) != 0) {
        return false;
    }
    return info.can_write_zeroes_with_unmap;
}
3716

    
3717
typedef struct BdrvCoGetBlockStatusData {
3718
    BlockDriverState *bs;
3719
    BlockDriverState *base;
3720
    int64_t sector_num;
3721
    int nb_sectors;
3722
    int *pnum;
3723
    int64_t ret;
3724
    bool done;
3725
} BdrvCoGetBlockStatusData;
3726

    
3727
/*
3728
 * Returns true iff the specified sector is present in the disk image. Drivers
3729
 * not implementing the functionality are assumed to not support backing files,
3730
 * hence all their sectors are reported as allocated.
3731
 *
3732
 * If 'sector_num' is beyond the end of the disk image the return value is 0
3733
 * and 'pnum' is set to 0.
3734
 *
3735
 * 'pnum' is set to the number of sectors (including and immediately following
3736
 * the specified sector) that are known to be in the same
3737
 * allocated/unallocated state.
3738
 *
3739
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3740
 * beyond the end of the disk image it will be clamped.
3741
 */
3742
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3743
                                                     int64_t sector_num,
3744
                                                     int nb_sectors, int *pnum)
3745
{
3746
    int64_t length;
3747
    int64_t n;
3748
    int64_t ret, ret2;
3749

    
3750
    length = bdrv_getlength(bs);
3751
    if (length < 0) {
3752
        return length;
3753
    }
3754

    
3755
    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3756
        *pnum = 0;
3757
        return 0;
3758
    }
3759

    
3760
    n = bs->total_sectors - sector_num;
3761
    if (n < nb_sectors) {
3762
        nb_sectors = n;
3763
    }
3764

    
3765
    if (!bs->drv->bdrv_co_get_block_status) {
3766
        *pnum = nb_sectors;
3767
        ret = BDRV_BLOCK_DATA;
3768
        if (bs->drv->protocol_name) {
3769
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3770
        }
3771
        return ret;
3772
    }
3773

    
3774
    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3775
    if (ret < 0) {
3776
        *pnum = 0;
3777
        return ret;
3778
    }
3779

    
3780
    if (ret & BDRV_BLOCK_RAW) {
3781
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
3782
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3783
                                     *pnum, pnum);
3784
    }
3785

    
3786
    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3787
        if (bdrv_unallocated_blocks_are_zero(bs)) {
3788
            ret |= BDRV_BLOCK_ZERO;
3789
        } else if (bs->backing_hd) {
3790
            BlockDriverState *bs2 = bs->backing_hd;
3791
            int64_t length2 = bdrv_getlength(bs2);
3792
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3793
                ret |= BDRV_BLOCK_ZERO;
3794
            }
3795
        }
3796
    }
3797

    
3798
    if (bs->file &&
3799
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3800
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
3801
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3802
                                        *pnum, pnum);
3803
        if (ret2 >= 0) {
3804
            /* Ignore errors.  This is just providing extra information, it
3805
             * is useful but not necessary.
3806
             */
3807
            ret |= (ret2 & BDRV_BLOCK_ZERO);
3808
        }
3809
    }
3810

    
3811
    return ret;
3812
}
3813

    
3814
/* Coroutine wrapper for bdrv_get_block_status() */
3815
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3816
{
3817
    BdrvCoGetBlockStatusData *data = opaque;
3818
    BlockDriverState *bs = data->bs;
3819

    
3820
    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3821
                                         data->pnum);
3822
    data->done = true;
3823
}
3824

    
3825
/*
3826
 * Synchronous wrapper around bdrv_co_get_block_status().
3827
 *
3828
 * See bdrv_co_get_block_status() for details.
3829
 */
3830
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3831
                              int nb_sectors, int *pnum)
3832
{
3833
    Coroutine *co;
3834
    BdrvCoGetBlockStatusData data = {
3835
        .bs = bs,
3836
        .sector_num = sector_num,
3837
        .nb_sectors = nb_sectors,
3838
        .pnum = pnum,
3839
        .done = false,
3840
    };
3841

    
3842
    if (qemu_in_coroutine()) {
3843
        /* Fast-path if already in coroutine context */
3844
        bdrv_get_block_status_co_entry(&data);
3845
    } else {
3846
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3847
        qemu_coroutine_enter(co, &data);
3848
        while (!data.done) {
3849
            qemu_aio_wait();
3850
        }
3851
    }
3852
    return data.ret;
3853
}
3854

    
3855
/*
 * Reduce the block status of a sector run to an allocated/unallocated
 * answer.
 *
 * Returns a negative value if bdrv_get_block_status() fails.  Otherwise
 * returns 1 when the status has BDRV_BLOCK_DATA set, or has BDRV_BLOCK_ZERO
 * set while bdrv_has_zero_init() is false for @bs; returns 0 in all other
 * cases.  'pnum' is filled in by bdrv_get_block_status().
 */
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t status = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);

    if (status < 0) {
        return status;
    }
    if (status & BDRV_BLOCK_DATA) {
        return 1;
    }
    if (!(status & BDRV_BLOCK_ZERO)) {
        return 0;
    }

    return !bdrv_has_zero_init(bs);
}
3866

    
3867
/*
3868
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3869
 *
3870
 * Return true if the given sector is allocated in any image between
3871
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3872
 * sector is allocated in any image of the chain.  Return false otherwise.
3873
 *
3874
 * 'pnum' is set to the number of sectors (including and immediately following
3875
 *  the specified sector) that are known to be in the same
3876
 *  allocated/unallocated state.
3877
 *
3878
 */
3879
int bdrv_is_allocated_above(BlockDriverState *top,
3880
                            BlockDriverState *base,
3881
                            int64_t sector_num,
3882
                            int nb_sectors, int *pnum)
3883
{
3884
    BlockDriverState *intermediate;
3885
    int ret, n = nb_sectors;
3886

    
3887
    intermediate = top;
3888
    while (intermediate && intermediate != base) {
3889
        int pnum_inter;
3890
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3891
                                &pnum_inter);
3892
        if (ret < 0) {
3893
            return ret;
3894
        } else if (ret) {
3895
            *pnum = pnum_inter;
3896
            return 1;
3897
        }
3898

    
3899
        /*
3900
         * [sector_num, nb_sectors] is unallocated on top but intermediate
3901
         * might have
3902
         *
3903
         * [sector_num+x, nr_sectors] allocated.
3904
         */
3905
        if (n > pnum_inter &&
3906
            (intermediate == top ||
3907
             sector_num + pnum_inter < intermediate->total_sectors)) {
3908
            n = pnum_inter;
3909
        }
3910

    
3911
        intermediate = intermediate->backing_hd;
3912
    }
3913

    
3914
    *pnum = n;
3915
    return 0;
3916
}
3917

    
3918
/*
 * Return the name of the encrypted image associated with @bs: the backing
 * file name if the backing image is encrypted, else the image's own
 * filename if @bs is encrypted, else NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }

    if (bs->encrypted) {
        return bs->filename;
    }

    return NULL;
}
3927

    
3928
/*
 * Copy the backing file name recorded in @bs into @filename using
 * pstrcpy() with a destination size of @filename_size.
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing_name = bs->backing_file;

    pstrcpy(filename, filename_size, backing_name);
}
3933

    
3934
/*
 * Forward a compressed write to the driver of @bs.
 *
 * Returns -ENOMEDIUM if no driver is attached, -ENOTSUP if the driver has
 * no bdrv_write_compressed callback, -EIO if bdrv_check_request() rejects
 * the range, otherwise the driver's return value.  Asserts that @bs has no
 * dirty bitmaps.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (drv == NULL) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_write_compressed == NULL) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors) != 0) {
        return -EIO;
    }

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
3949

    
3950
/*
 * Fill @bdi with driver information for @bs.
 *
 * Returns -ENOMEDIUM if no driver is attached and -ENOTSUP if the driver
 * has no bdrv_get_info callback; @bdi is left untouched in both cases.
 * Otherwise @bdi is zeroed first and the driver's return value is passed
 * on.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (drv == NULL) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_info == NULL) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
3960

    
3961
/*
 * Return the driver's bdrv_get_specific_info() result for @bs, or NULL when
 * no driver is attached or the driver lacks that callback.
 */
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_get_specific_info) {
        return NULL;
    }

    return drv->bdrv_get_specific_info(bs);
}
3969

    
3970
/*
 * Write @size bytes from @buf as VM state at position @pos by wrapping the
 * buffer in a single-element I/O vector and calling bdrv_writev_vmstate().
 */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    struct iovec single = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };
    QEMUIOVector vec;

    qemu_iovec_init_external(&vec, &single, 1);

    return bdrv_writev_vmstate(bs, &vec, pos);
}
3982

    
3983
/*
 * Save VM state from @qiov at position @pos.
 *
 * Starting at @bs and descending through the ->file links, the first node
 * whose driver implements bdrv_save_vmstate handles the request.  Returns
 * -ENOMEDIUM if a node without a driver is reached and -ENOTSUP if the
 * chain ends without a handler.
 */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    do {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_save_vmstate) {
            return drv->bdrv_save_vmstate(bs, qiov, pos);
        }

        bs = bs->file;
    } while (bs);

    return -ENOTSUP;
}
3997

    
3998
/*
 * Load @size bytes of VM state from position @pos into @buf.
 *
 * Starting at @bs and descending through the ->file links, the first node
 * whose driver implements bdrv_load_vmstate handles the request.  Returns
 * -ENOMEDIUM if a node without a driver is reached and -ENOTSUP if the
 * chain ends without a handler.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    do {
        BlockDriver *drv = bs->drv;

        if (!drv) {
            return -ENOMEDIUM;
        }
        if (drv->bdrv_load_vmstate) {
            return drv->bdrv_load_vmstate(bs, buf, pos, size);
        }

        bs = bs->file;
    } while (bs);

    return -ENOTSUP;
}
4010

    
4011
/*
 * Deliver @event to the driver's bdrv_debug_event callback.  Does nothing
 * when @bs is NULL, has no driver, or the driver lacks the callback.
 */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (bs && bs->drv && bs->drv->bdrv_debug_event) {
        bs->drv->bdrv_debug_event(bs, event);
    }
}
4019

    
4020
/*
 * Call bdrv_debug_breakpoint on the first node, starting at @bs and
 * following ->file, whose driver implements it.  Returns that callback's
 * result, or -ENOTSUP if the walk ends (NULL node or node without driver)
 * before one is found.
 */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_breakpoint) {
            return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
        }
    }

    return -ENOTSUP;
}
4033

    
4034
/*
 * Call bdrv_debug_remove_breakpoint on the first node, starting at @bs and
 * following ->file, whose driver implements it.  Returns that callback's
 * result, or -ENOTSUP if the walk ends before one is found.
 */
int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_remove_breakpoint) {
            return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
        }
    }

    return -ENOTSUP;
}
4046

    
4047
/*
 * Call bdrv_debug_resume on the first node, starting at @bs and following
 * ->file, whose driver implements it.  Returns that callback's result, or
 * -ENOTSUP if the walk ends before one is found.
 */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_resume) {
            return bs->drv->bdrv_debug_resume(bs, tag);
        }
    }

    return -ENOTSUP;
}
4059

    
4060
/*
 * Call bdrv_debug_is_suspended on the first node, starting at @bs and
 * following ->file, whose driver implements it.  Returns that callback's
 * result, or false if the walk ends before one is found.
 */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    for (; bs && bs->drv; bs = bs->file) {
        if (bs->drv->bdrv_debug_is_suspended) {
            return bs->drv->bdrv_debug_is_suspended(bs, tag);
        }
    }

    return false;
}
4072

    
4073
/* Return 1 if BDRV_O_SNAPSHOT is set in the open flags of @bs, else 0. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
4077

    
4078
/* backing_file can either be relative, or absolute, or a protocol.  If it is
4079
 * relative, it must be relative to the chain.  So, passing in bs->filename
4080
 * from a BDS as backing_file should not be done, as that may be relative to
4081
 * the CWD rather than the chain. */
4082
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4083
        const char *backing_file)
4084
{
4085
    char *filename_full = NULL;
4086
    char *backing_file_full = NULL;
4087
    char *filename_tmp = NULL;
4088
    int is_protocol = 0;
4089
    BlockDriverState *curr_bs = NULL;
4090
    BlockDriverState *retval = NULL;
4091

    
4092
    if (!bs || !bs->drv || !backing_file) {
4093
        return NULL;
4094
    }
4095

    
4096
    filename_full     = g_malloc(PATH_MAX);
4097
    backing_file_full = g_malloc(PATH_MAX);
4098
    filename_tmp      = g_malloc(PATH_MAX);
4099

    
4100
    is_protocol = path_has_protocol(backing_file);
4101

    
4102
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4103

    
4104
        /* If either of the filename paths is actually a protocol, then
4105
         * compare unmodified paths; otherwise make paths relative */
4106
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4107
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4108
                retval = curr_bs->backing_hd;
4109
                break;
4110
            }
4111
        } else {
4112
            /* If not an absolute filename path, make it relative to the current
4113
             * image's filename path */
4114
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4115
                         backing_file);
4116

    
4117
            /* We are going to compare absolute pathnames */
4118
            if (!realpath(filename_tmp, filename_full)) {
4119
                continue;
4120
            }
4121

    
4122
            /* We need to make sure the backing filename we are comparing against
4123
             * is relative to the current image filename (or absolute) */
4124
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4125
                         curr_bs->backing_file);
4126

    
4127
            if (!realpath(filename_tmp, backing_file_full)) {
4128
                continue;
4129
            }
4130

    
4131
            if (strcmp(backing_file_full, filename_full) == 0) {
4132
                retval = curr_bs->backing_hd;
4133
                break;
4134
            }
4135
        }
4136
    }
4137

    
4138
    g_free(filename_full);
4139
    g_free(backing_file_full);
4140
    g_free(filename_tmp);
4141
    return retval;
4142
}
4143

    
4144
/*
 * Count how many backing images lie below @bs.  The walk stops at the first
 * node that has no driver or no backing image.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
4156

    
4157
/*
 * Follow the backing_hd links from @bs and return the last image in the
 * chain (the one without a backing image).  Returns NULL if @bs is NULL.
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    if (!bs) {
        return NULL;
    }

    while (bs->backing_hd) {
        bs = bs->backing_hd;
    }

    return bs;
}
4172

    
4173
/**************************************************************/
4174
/* async I/Os */
4175

    
4176
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4177
                                 QEMUIOVector *qiov, int nb_sectors,
4178
                                 BlockDriverCompletionFunc *cb, void *opaque)
4179
{
4180
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4181

    
4182
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4183
                                 cb, opaque, false);
4184
}
4185

    
4186
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4187
                                  QEMUIOVector *qiov, int nb_sectors,
4188
                                  BlockDriverCompletionFunc *cb, void *opaque)
4189
{
4190
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4191

    
4192
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4193
                                 cb, opaque, true);
4194
}
4195

    
4196
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4197
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4198
        BlockDriverCompletionFunc *cb, void *opaque)
4199
{
4200
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4201

    
4202
    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4203
                                 BDRV_REQ_ZERO_WRITE | flags,
4204
                                 cb, opaque, true);
4205
}
4206

    
4207

    
4208
typedef struct MultiwriteCB {
4209
    int error;
4210
    int num_requests;
4211
    int num_callbacks;
4212
    struct {
4213
        BlockDriverCompletionFunc *cb;
4214
        void *opaque;
4215
        QEMUIOVector *free_qiov;
4216
    } callbacks[];
4217
} MultiwriteCB;
4218

    
4219
/*
 * Invoke every caller-supplied completion callback with the overall error
 * code, then destroy and free the merged I/O vector attached to that slot,
 * if there is one.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int slot;

    for (slot = 0; slot < mcb->num_callbacks; slot++) {
        QEMUIOVector *merged;

        mcb->callbacks[slot].cb(mcb->callbacks[slot].opaque, mcb->error);

        merged = mcb->callbacks[slot].free_qiov;
        if (merged) {
            qemu_iovec_destroy(merged);
            g_free(merged);
        }
    }
}
4231

    
4232
static void multiwrite_cb(void *opaque, int ret)
4233
{
4234
    MultiwriteCB *mcb = opaque;
4235

    
4236
    trace_multiwrite_cb(mcb, ret);
4237

    
4238
    if (ret < 0 && !mcb->error) {
4239
        mcb->error = ret;
4240
    }
4241

    
4242
    mcb->num_requests--;
4243
    if (mcb->num_requests == 0) {
4244
        multiwrite_user_cb(mcb);
4245
        g_free(mcb);
4246
    }
4247
}
4248

    
4249
static int multiwrite_req_compare(const void *a, const void *b)
4250
{
4251
    const BlockRequest *req1 = a, *req2 = b;
4252

    
4253
    /*
4254
     * Note that we can't simply subtract req2->sector from req1->sector
4255
     * here as that could overflow the return value.
4256
     */
4257
    if (req1->sector > req2->sector) {
4258
        return 1;
4259
    } else if (req1->sector < req2->sector) {
4260
        return -1;
4261
    } else {
4262
        return 0;
4263
    }
4264
}
4265

    
4266
/*
4267
 * Takes a bunch of requests and tries to merge them. Returns the number of
4268
 * requests that remain after merging.
4269
 */
4270
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4271
    int num_reqs, MultiwriteCB *mcb)
4272
{
4273
    int i, outidx;
4274

    
4275
    // Sort requests by start sector
4276
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4277

    
4278
    // Check if adjacent requests touch the same clusters. If so, combine them,
4279
    // filling up gaps with zero sectors.
4280
    outidx = 0;
4281
    for (i = 1; i < num_reqs; i++) {
4282
        int merge = 0;
4283
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4284

    
4285
        // Handle exactly sequential writes and overlapping writes.
4286
        if (reqs[i].sector <= oldreq_last) {
4287
            merge = 1;
4288
        }
4289

    
4290
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4291
            merge = 0;
4292
        }
4293

    
4294
        if (merge) {
4295
            size_t size;
4296
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4297
            qemu_iovec_init(qiov,
4298
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4299

    
4300
            // Add the first request to the merged one. If the requests are
4301
            // overlapping, drop the last sectors of the first request.
4302
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
4303
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4304

    
4305
            // We should need to add any zeros between the two requests
4306
            assert (reqs[i].sector <= oldreq_last);
4307

    
4308
            // Add the second request
4309
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4310

    
4311
            reqs[outidx].nb_sectors = qiov->size >> 9;
4312
            reqs[outidx].qiov = qiov;
4313

    
4314
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4315
        } else {
4316
            outidx++;
4317
            reqs[outidx].sector     = reqs[i].sector;
4318
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4319
            reqs[outidx].qiov       = reqs[i].qiov;
4320
        }
4321
    }
4322

    
4323
    return outidx + 1;
4324
}
4325

    
4326
/*
4327
 * Submit multiple AIO write requests at once.
4328
 *
4329
 * On success, the function returns 0 and all requests in the reqs array have
4330
 * been submitted. In error case this function returns -1, and any of the
4331
 * requests may or may not be submitted yet. In particular, this means that the
4332
 * callback will be called for some of the requests, for others it won't. The
4333
 * caller must check the error field of the BlockRequest to wait for the right
4334
 * callbacks (if error != 0, no callback will be called).
4335
 *
4336
 * The implementation may modify the contents of the reqs array, e.g. to merge
4337
 * requests. However, the fields opaque and error are left unmodified as they
4338
 * are used to signal failure for a single request to the caller.
4339
 */
4340
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4341
{
4342
    MultiwriteCB *mcb;
4343
    int i;
4344

    
4345
    /* don't submit writes if we don't have a medium */
4346
    if (bs->drv == NULL) {
4347
        for (i = 0; i < num_reqs; i++) {
4348
            reqs[i].error = -ENOMEDIUM;
4349
        }
4350
        return -1;
4351
    }
4352

    
4353
    if (num_reqs == 0) {
4354
        return 0;
4355
    }
4356

    
4357
    // Create MultiwriteCB structure
4358
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4359
    mcb->num_requests = 0;
4360
    mcb->num_callbacks = num_reqs;
4361

    
4362
    for (i = 0; i < num_reqs; i++) {
4363
        mcb->callbacks[i].cb = reqs[i].cb;
4364
        mcb->callbacks[i].opaque = reqs[i].opaque;
4365
    }
4366

    
4367
    // Check for mergable requests
4368
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4369

    
4370
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4371

    
4372
    /* Run the aio requests. */
4373
    mcb->num_requests = num_reqs;
4374
    for (i = 0; i < num_reqs; i++) {
4375
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4376
                              reqs[i].nb_sectors, reqs[i].flags,
4377
                              multiwrite_cb, mcb,
4378
                              true);
4379
    }
4380

    
4381
    return 0;
4382
}
4383

    
4384
/* Cancel @acb by dispatching to the cancel hook of its AIOCBInfo. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    const AIOCBInfo *info = acb->aiocb_info;

    info->cancel(acb);
}
4388

    
4389
/**************************************************************/
4390
/* async block device emulation */
4391

    
4392
typedef struct BlockDriverAIOCBSync {
4393
    BlockDriverAIOCB common;
4394
    QEMUBH *bh;
4395
    int ret;
4396
    /* vector translation state */
4397
    QEMUIOVector *qiov;
4398
    uint8_t *bounce;
4399
    int is_write;
4400
} BlockDriverAIOCBSync;
4401

    
4402
/*
 * Cancel hook for emulated-AIO requests (BlockDriverAIOCBSync).
 *
 * Deletes the pending completion bottom half, so the user callback is never
 * invoked, and releases the AIOCB.  The bounce buffer is otherwise freed
 * only by the bottom half, so it must be freed here as well or it leaks.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;

    /* bdrv_aio_bh_cb() will not run any more; drop its buffer ourselves */
    qemu_vfree(acb->bounce);
    acb->bounce = NULL;

    qemu_aio_release(acb);
}
4410

    
4411
static const AIOCBInfo bdrv_em_aiocb_info = {
4412
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4413
    .cancel             = bdrv_aio_cancel_em,
4414
};
4415

    
4416
static void bdrv_aio_bh_cb(void *opaque)
4417
{
4418
    BlockDriverAIOCBSync *acb = opaque;
4419

    
4420
    if (!acb->is_write)
4421
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4422
    qemu_vfree(acb->bounce);
4423
    acb->common.cb(acb->common.opaque, acb->ret);
4424
    qemu_bh_delete(acb->bh);
4425
    acb->bh = NULL;
4426
    qemu_aio_release(acb);
4427
}
4428

    
4429
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4430
                                            int64_t sector_num,
4431
                                            QEMUIOVector *qiov,
4432
                                            int nb_sectors,
4433
                                            BlockDriverCompletionFunc *cb,
4434
                                            void *opaque,
4435
                                            int is_write)
4436

    
4437
{
4438
    BlockDriverAIOCBSync *acb;
4439

    
4440
    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4441
    acb->is_write = is_write;
4442
    acb->qiov = qiov;
4443
    acb->bounce = qemu_blockalign(bs, qiov->size);
4444
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4445

    
4446
    if (is_write) {
4447
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4448
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4449
    } else {
4450
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4451
    }
4452

    
4453
    qemu_bh_schedule(acb->bh);
4454

    
4455
    return &acb->common;
4456
}
4457

    
4458
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4459
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4460
        BlockDriverCompletionFunc *cb, void *opaque)
4461
{
4462
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4463
}
4464

    
4465
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4466
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4467
        BlockDriverCompletionFunc *cb, void *opaque)
4468
{
4469
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4470
}
4471

    
4472

    
4473
typedef struct BlockDriverAIOCBCoroutine {
4474
    BlockDriverAIOCB common;
4475
    BlockRequest req;
4476
    bool is_write;
4477
    bool *done;
4478
    QEMUBH* bh;
4479
} BlockDriverAIOCBCoroutine;
4480

    
4481
/*
 * Cancel hook for coroutine-based AIOCBs.  The request is not aborted:
 * this function points the AIOCB's done flag at a local variable and calls
 * qemu_aio_wait() until the completion bottom half sets it.
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool finished = false;

    acb->done = &finished;
    do {
        qemu_aio_wait();
    } while (!finished);
}
4492

    
4493
static const AIOCBInfo bdrv_em_co_aiocb_info = {
4494
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4495
    .cancel             = bdrv_aio_co_cancel_em,
4496
};
4497

    
4498
static void bdrv_co_em_bh(void *opaque)
4499
{
4500
    BlockDriverAIOCBCoroutine *acb = opaque;
4501

    
4502
    acb->common.cb(acb->common.opaque, acb->req.error);
4503

    
4504
    if (acb->done) {
4505
        *acb->done = true;
4506
    }
4507

    
4508
    qemu_bh_delete(acb->bh);
4509
    qemu_aio_release(acb);
4510
}
4511

    
4512
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4513
static void coroutine_fn bdrv_co_do_rw(void *opaque)
4514
{
4515
    BlockDriverAIOCBCoroutine *acb = opaque;
4516
    BlockDriverState *bs = acb->common.bs;
4517

    
4518
    if (!acb->is_write) {
4519
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4520
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4521
    } else {
4522
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4523
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4524
    }
4525

    
4526
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4527
    qemu_bh_schedule(acb->bh);
4528
}
4529

    
4530
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4531
                                               int64_t sector_num,
4532
                                               QEMUIOVector *qiov,
4533
                                               int nb_sectors,
4534
                                               BdrvRequestFlags flags,
4535
                                               BlockDriverCompletionFunc *cb,
4536
                                               void *opaque,
4537
                                               bool is_write)
4538
{
4539
    Coroutine *co;
4540
    BlockDriverAIOCBCoroutine *acb;
4541

    
4542
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4543
    acb->req.sector = sector_num;
4544
    acb->req.nb_sectors = nb_sectors;
4545
    acb->req.qiov = qiov;
4546
    acb->req.flags = flags;
4547
    acb->is_write = is_write;
4548
    acb->done = NULL;
4549

    
4550
    co = qemu_coroutine_create(bdrv_co_do_rw);
4551
    qemu_coroutine_enter(co, acb);
4552

    
4553
    return &acb->common;
4554
}
4555

    
4556
/*
 * Coroutine entry point for bdrv_aio_flush(): flushes the AIOCB's device
 * with bdrv_co_flush(), stores the result in req.error and schedules the
 * completion bottom half.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->req.error = bdrv_co_flush(acb->common.bs);

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4565

    
4566
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4567
        BlockDriverCompletionFunc *cb, void *opaque)
4568
{
4569
    trace_bdrv_aio_flush(bs, opaque);
4570

    
4571
    Coroutine *co;
4572
    BlockDriverAIOCBCoroutine *acb;
4573

    
4574
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4575
    acb->done = NULL;
4576

    
4577
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4578
    qemu_coroutine_enter(co, acb);
4579

    
4580
    return &acb->common;
4581
}
4582

    
4583
/*
 * Coroutine entry point for bdrv_aio_discard(): discards the sector range
 * recorded in the AIOCB with bdrv_co_discard(), stores the result in
 * req.error and schedules the completion bottom half.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockRequest *req = &acb->req;

    req->error = bdrv_co_discard(acb->common.bs, req->sector, req->nb_sectors);

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4592

    
4593
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4594
        int64_t sector_num, int nb_sectors,
4595
        BlockDriverCompletionFunc *cb, void *opaque)
4596
{
4597
    Coroutine *co;
4598
    BlockDriverAIOCBCoroutine *acb;
4599

    
4600
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4601

    
4602
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4603
    acb->req.sector = sector_num;
4604
    acb->req.nb_sectors = nb_sectors;
4605
    acb->done = NULL;
4606
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4607
    qemu_coroutine_enter(co, acb);
4608

    
4609
    return &acb->common;
4610
}
4611

    
4612
void bdrv_init(void)
4613
{
4614
    module_call_init(MODULE_INIT_BLOCK);
4615
}
4616

    
4617
void bdrv_init_with_whitelist(void)
4618
{
4619
    use_bdrv_whitelist = 1;
4620
    bdrv_init();
4621
}
4622

    
4623
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4624
                   BlockDriverCompletionFunc *cb, void *opaque)
4625
{
4626
    BlockDriverAIOCB *acb;
4627

    
4628
    acb = g_slice_alloc(aiocb_info->aiocb_size);
4629
    acb->aiocb_info = aiocb_info;
4630
    acb->bs = bs;
4631
    acb->cb = cb;
4632
    acb->opaque = opaque;
4633
    return acb;
4634
}
4635

    
4636
void qemu_aio_release(void *p)
4637
{
4638
    BlockDriverAIOCB *acb = p;
4639
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4640
}
4641

    
4642
/**************************************************************/
4643
/* Coroutine block device emulation */
4644

    
4645
typedef struct CoroutineIOCompletion {
4646
    Coroutine *coroutine;
4647
    int ret;
4648
} CoroutineIOCompletion;
4649

    
4650
static void bdrv_co_io_em_complete(void *opaque, int ret)
4651
{
4652
    CoroutineIOCompletion *co = opaque;
4653

    
4654
    co->ret = ret;
4655
    qemu_coroutine_enter(co->coroutine, NULL);
4656
}
4657

    
4658
/*
 * Perform a vectored read or write from coroutine context on top of the
 * driver's AIO callbacks.
 *
 * The request is submitted through bdrv_aio_writev (if @is_write) or
 * bdrv_aio_readv with bdrv_co_io_em_complete() as the completion callback;
 * the coroutine then yields and returns the value stored by that callback.
 *
 * Returns -EIO if the driver returned no AIOCB.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    acb = is_write
        ? bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                   bdrv_co_io_em_complete, &completion)
        : bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                  bdrv_co_io_em_complete, &completion);

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    qemu_coroutine_yield();
    return completion.ret;
}
4683

    
4684
/* Coroutine read built on the AIO interface: bdrv_co_io_em() with
 * is_write == false. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4690

    
4691
/* Coroutine write built on the AIO interface: bdrv_co_io_em() with
 * is_write == true. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4697

    
4698
/* Coroutine entry point for bdrv_flush(): @opaque is an RwCo whose ret field
 * receives the result of bdrv_co_flush() on its bs. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *args = opaque;
    int result = bdrv_co_flush(args->bs);

    args->ret = result;
}
4704

    
4705
/*
 * Flush @bs from coroutine context.
 *
 * Returns 0 straight away if @bs is NULL, has no medium inserted or is
 * read-only.  Otherwise:
 *   1. the driver's bdrv_co_flush_to_os hook (if any) is called;
 *   2. unless BDRV_O_NO_FLUSH is set, the driver's bdrv_co_flush_to_disk
 *      hook, or failing that its bdrv_aio_flush hook, is called;
 *   3. bs->file is flushed recursively and that result is returned.
 * A negative result from a driver hook is returned immediately.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (bs == NULL || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);

        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            CoroutineIOCompletion completion = {
                .coroutine = qemu_coroutine_self(),
            };
            BlockDriverAIOCB *acb;

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
                                          &completion);
            if (acb != NULL) {
                /* Resumed by bdrv_co_io_em_complete() */
                qemu_coroutine_yield();
                ret = completion.ret;
            } else {
                ret = -EIO;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and therefore don't support bdrv_flush. Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk. Returning an
             * error doesn't work because that would break guests even if the
             * server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }

        if (ret < 0) {
            return ret;
        }
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
4767

    
4768
/* Call the driver's bdrv_invalidate_cache hook for @bs; does nothing when
 * @bs has no driver or the driver has no such hook. */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_invalidate_cache) {
        return;
    }

    drv->bdrv_invalidate_cache(bs);
}
4774

    
4775
void bdrv_invalidate_cache_all(void)
4776
{
4777
    BlockDriverState *bs;
4778

    
4779
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4780
        bdrv_invalidate_cache(bs);
4781
    }
4782
}
4783

    
4784
void bdrv_clear_incoming_migration_all(void)
4785
{
4786
    BlockDriverState *bs;
4787

    
4788
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4789
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4790
    }
4791
}
4792

    
4793
/*
 * Synchronous flush of @bs.
 *
 * When already running in a coroutine the flush entry point is called
 * directly.  Otherwise a new coroutine is created and entered, and
 * qemu_aio_wait() is called repeatedly until the result is no longer
 * NOT_DONE.
 *
 * Returns the value produced by bdrv_co_flush().
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };
    Coroutine *co;

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
        return rwco.ret;
    }

    co = qemu_coroutine_create(bdrv_flush_co_entry);
    qemu_coroutine_enter(co, &rwco);
    while (rwco.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return rwco.ret;
}
4814

    
4815
typedef struct DiscardCo {
4816
    BlockDriverState *bs;
4817
    int64_t sector_num;
4818
    int nb_sectors;
4819
    int ret;
4820
} DiscardCo;
4821
/* Coroutine entry point for bdrv_discard(): @opaque is a DiscardCo whose ret
 * field receives the result of bdrv_co_discard() on the described range. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *args = opaque;
    int result = bdrv_co_discard(args->bs, args->sector_num,
                                 args->nb_sectors);

    args->ret = result;
}
4827

    
4828
/* if no limit is specified in the BlockLimits use a default
4829
 * of 32768 512-byte sectors (16 MiB) per request.
4830
 */
4831
#define MAX_DISCARD_DEFAULT 32768 /* used by bdrv_co_discard() when bs->bl.max_discard is 0 */
4832

    
4833
/*
 * Discard @nb_sectors sectors starting at @sector_num, from coroutine
 * context.
 *
 * The dirty bitmaps are reset for the whole range first.  The range is then
 * split into chunks: a chunk starting at an unaligned sector is shortened so
 * that it ends on a bs->bl.discard_alignment boundary, and no chunk exceeds
 * bs->bl.max_discard (or MAX_DISCARD_DEFAULT when that limit is 0).  Each
 * chunk is passed to the driver's bdrv_co_discard hook or, failing that, its
 * bdrv_aio_discard hook.
 *
 * Returns:
 *   -ENOMEDIUM  if @bs has no driver
 *   -EIO        if bdrv_check_request() rejects the range, or the driver's
 *               AIO hook returned no AIOCB
 *   -EROFS      if @bs is read-only
 *   0           on success, when BDRV_O_UNMAP is not set, or when the driver
 *               implements neither discard hook
 *   otherwise   the first non-zero driver result other than -ENOTSUP
 *               (-ENOTSUP from the driver is ignored)
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request: make an unaligned head end on the next boundary */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /*
             * Submit only the current chunk (num), exactly like the
             * coroutine path above.  Passing the whole remaining count here
             * would bypass the alignment and size limits and discard
             * overlapping ranges again on each iteration.
             */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                /* Resumed by bdrv_co_io_em_complete() */
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
4903

    
4904
/*
 * Synchronous discard of @nb_sectors sectors starting at @sector_num.
 *
 * When already running in a coroutine the discard entry point is called
 * directly.  Otherwise a new coroutine is created and entered, and
 * qemu_aio_wait() is called repeatedly until the result is no longer
 * NOT_DONE.
 *
 * Returns the value produced by bdrv_co_discard().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    DiscardCo args = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };
    Coroutine *co;

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&args);
        return args.ret;
    }

    co = qemu_coroutine_create(bdrv_discard_co_entry);
    qemu_coroutine_enter(co, &args);
    while (args.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return args.ret;
}
4927

    
4928
/**************************************************************/
4929
/* removable device support */
4930

    
4931
/**
4932
 * Return TRUE if the media is present
4933
 */
4934
int bdrv_is_inserted(BlockDriverState *bs)
4935
{
4936
    BlockDriver *drv = bs->drv;
4937

    
4938
    if (!drv)
4939
        return 0;
4940
    if (!drv->bdrv_is_inserted)
4941
        return 1;
4942
    return drv->bdrv_is_inserted(bs);
4943
}
4944

    
4945
/**
4946
 * Return whether the media changed since the last call to this
4947
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4948
 */
4949
int bdrv_media_changed(BlockDriverState *bs)
4950
{
4951
    BlockDriver *drv = bs->drv;
4952

    
4953
    if (drv && drv->bdrv_media_changed) {
4954
        return drv->bdrv_media_changed(bs);
4955
    }
4956
    return -ENOTSUP;
4957
}
4958

    
4959
/**
4960
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4961
 */
4962
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4963
{
4964
    BlockDriver *drv = bs->drv;
4965

    
4966
    if (drv && drv->bdrv_eject) {
4967
        drv->bdrv_eject(bs, eject_flag);
4968
    }
4969

    
4970
    if (bs->device_name[0] != '\0') {
4971
        bdrv_emit_qmp_eject_event(bs, eject_flag);
4972
    }
4973
}
4974

    
4975
/**
4976
 * Lock or unlock the media (if it is locked, the user won't be able
4977
 * to eject it manually).
4978
 */
4979
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4980
{
4981
    BlockDriver *drv = bs->drv;
4982

    
4983
    trace_bdrv_lock_medium(bs, locked);
4984

    
4985
    if (drv && drv->bdrv_lock_medium) {
4986
        drv->bdrv_lock_medium(bs, locked);
4987
    }
4988
}
4989

    
4990
/* needed for generic scsi interface */
4991

    
4992
/* Forward an ioctl to the driver's bdrv_ioctl hook.  Returns -ENOTSUP when
 * @bs has no driver or the driver has no such hook. */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }

    return drv->bdrv_ioctl(bs, req, buf);
}
5000

    
5001
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5002
        unsigned long int req, void *buf,
5003
        BlockDriverCompletionFunc *cb, void *opaque)
5004
{
5005
    BlockDriver *drv = bs->drv;
5006

    
5007
    if (drv && drv->bdrv_aio_ioctl)
5008
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5009
    return NULL;
5010
}
5011

    
5012
/* Store @align in bs->guest_block_size. */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
5016

    
5017
/* Allocate @size bytes with qemu_memalign(), using bdrv_opt_mem_align(bs)
 * as the alignment. */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    size_t alignment = bdrv_opt_mem_align(bs);

    return qemu_memalign(alignment, size);
}
5021

    
5022
/*
5023
 * Check if all memory in this vector is sector aligned.
5024
 */
5025
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5026
{
5027
    int i;
5028
    size_t alignment = bdrv_opt_mem_align(bs);
5029

    
5030
    for (i = 0; i < qiov->niov; i++) {
5031
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5032
            return false;
5033
        }
5034
        if (qiov->iov[i].iov_len % alignment) {
5035
            return false;
5036
        }
5037
    }
5038

    
5039
    return true;
5040
}
5041

    
5042
/*
 * Create a dirty bitmap for @bs and insert it at the head of
 * bs->dirty_bitmaps.
 *
 * @granularity is given in bytes; it must be a power of two and at least one
 * sector (both asserted).  The bitmap covers bdrv_getlength(bs) converted to
 * sectors.
 *
 * NOTE(review): the value returned by bdrv_getlength() is used without a
 * check for a negative result - confirm callers guarantee a valid length.
 */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    BdrvDirtyBitmap *bitmap;
    int64_t nb_sectors;

    assert((granularity & (granularity - 1)) == 0);

    /* From here on the granularity is expressed in sectors */
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);

    nb_sectors = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);

    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(nb_sectors, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);

    return bitmap;
}
5057

    
5058
/*
 * Remove @bitmap from bs->dirty_bitmaps and free it together with its
 * HBitmap.  Does nothing if @bitmap is not found in the list.
 */
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *cur, *next;

    QLIST_FOREACH_SAFE(cur, &bs->dirty_bitmaps, list, next) {
        if (cur != bitmap) {
            continue;
        }

        QLIST_REMOVE(cur, list);
        hbitmap_free(cur->bitmap);
        g_free(cur);
        return;
    }
}
5070

    
5071
/*
 * Build a newly allocated BlockDirtyInfoList with one entry per dirty bitmap
 * of @bs, in list order.  Each entry carries the bitmap's dirty count and
 * its granularity in bytes.  Returns NULL when @bs has no dirty bitmaps.
 */
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BlockDirtyInfoList *head = NULL;
    BlockDirtyInfoList **tail = &head;
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *node = g_malloc0(sizeof(BlockDirtyInfoList));

        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));

        /* Append at the tail so the output keeps the list order */
        node->value = info;
        *tail = node;
        tail = &node->next;
    }

    return head;
}
5090

    
5091
/* Return hbitmap_get() for @sector in @bitmap, or 0 when @bitmap is NULL. */
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (!bitmap) {
        return 0;
    }

    return hbitmap_get(bitmap->bitmap, sector);
}
5099

    
5100
void bdrv_dirty_iter_init(BlockDriverState *bs,
5101
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5102
{
5103
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5104
}
5105

    
5106
/* Mark @nr_sectors sectors starting at @cur_sector in every dirty bitmap
 * of @bs (hbitmap_set). */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        hbitmap_set(bm->bitmap, cur_sector, nr_sectors);
    }
}
5114

    
5115
/* Clear @nr_sectors sectors starting at @cur_sector in every dirty bitmap
 * of @bs (hbitmap_reset). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bm->bitmap, cur_sector, nr_sectors);
    }
}
5122

    
5123
/* Return hbitmap_count() of @bitmap. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    int64_t count = hbitmap_count(bitmap->bitmap);

    return count;
}
5127

    
5128
/* Get a reference to bs */
5129
void bdrv_ref(BlockDriverState *bs)
5130
{
5131
    bs->refcnt++;
5132
}
5133

    
5134
/* Release a previously grabbed reference to bs.
5135
 * If after releasing, reference count is zero, the BlockDriverState is
5136
 * deleted. */
5137
void bdrv_unref(BlockDriverState *bs)
5138
{
5139
    assert(bs->refcnt > 0);
5140
    if (--bs->refcnt == 0) {
5141
        bdrv_delete(bs);
5142
    }
5143
}
5144

    
5145
/* Set the in_use flag of @bs.  The new value must differ from the current
 * one (asserted). */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(in_use != bs->in_use);

    bs->in_use = in_use;
}
5150

    
5151
/* Return the in_use flag of @bs. */
int bdrv_in_use(BlockDriverState *bs)
{
    int in_use = bs->in_use;

    return in_use;
}
5155

    
5156
/* Enable I/O status tracking on @bs and set the status to
 * BLOCK_DEVICE_IO_STATUS_OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    bs->iostatus_enabled = true;
}
5161

    
5162
/* The I/O status is only enabled if the drive explicitly
5163
 * enables it _and_ the VM is configured to stop on errors */
5164
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5165
{
5166
    return (bs->iostatus_enabled &&
5167
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5168
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5169
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5170
}
5171

    
5172
/* Disable I/O status tracking on @bs.  The stored status value itself is
 * left untouched. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5176

    
5177
/* If the I/O status is enabled for @bs, reset it to
 * BLOCK_DEVICE_IO_STATUS_OK and also reset the I/O status of the attached
 * block job, if any. */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    if (bs->job) {
        block_job_iostatus_reset(bs->job);
    }
}
5186

    
5187
/*
 * Record an I/O error on @bs.  The I/O status must be enabled (asserted).
 * Only the first error is kept: the status changes only while it is still
 * BLOCK_DEVICE_IO_STATUS_OK.  @error is a positive errno value; ENOSPC maps
 * to BLOCK_DEVICE_IO_STATUS_NOSPACE, anything else to
 * BLOCK_DEVICE_IO_STATUS_FAILED.
 */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));

    if (bs->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
        return;
    }

    if (error == ENOSPC) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
    } else {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
5195

    
5196
void
5197
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5198
        enum BlockAcctType type)
5199
{
5200
    assert(type < BDRV_MAX_IOTYPE);
5201

    
5202
    cookie->bytes = bytes;
5203
    cookie->start_time_ns = get_clock();
5204
    cookie->type = type;
5205
}
5206

    
5207
void
5208
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5209
{
5210
    assert(cookie->type < BDRV_MAX_IOTYPE);
5211

    
5212
    bs->nr_bytes[cookie->type] += cookie->bytes;
5213
    bs->nr_ops[cookie->type]++;
5214
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5215
}
5216

    
5217
void bdrv_img_create(const char *filename, const char *fmt,
5218
                     const char *base_filename, const char *base_fmt,
5219
                     char *options, uint64_t img_size, int flags,
5220
                     Error **errp, bool quiet)
5221
{
5222
    QEMUOptionParameter *param = NULL, *create_options = NULL;
5223
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
5224
    BlockDriver *drv, *proto_drv;
5225
    BlockDriver *backing_drv = NULL;
5226
    Error *local_err = NULL;
5227
    int ret = 0;
5228

    
5229
    /* Find driver and parse its options */
5230
    drv = bdrv_find_format(fmt);
5231
    if (!drv) {
5232
        error_setg(errp, "Unknown file format '%s'", fmt);
5233
        return;
5234
    }
5235

    
5236
    proto_drv = bdrv_find_protocol(filename, true);
5237
    if (!proto_drv) {
5238
        error_setg(errp, "Unknown protocol '%s'", filename);
5239
        return;
5240
    }
5241

    
5242
    create_options = append_option_parameters(create_options,
5243
                                              drv->create_options);
5244
    create_options = append_option_parameters(create_options,
5245
                                              proto_drv->create_options);
5246

    
5247
    /* Create parameter list with default values */
5248
    param = parse_option_parameters("", create_options, param);
5249

    
5250
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5251

    
5252
    /* Parse -o options */
5253
    if (options) {
5254
        param = parse_option_parameters(options, create_options, param);
5255
        if (param == NULL) {
5256
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
5257
            goto out;
5258
        }
5259
    }
5260

    
5261
    if (base_filename) {
5262
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5263
                                 base_filename)) {
5264
            error_setg(errp, "Backing file not supported for file format '%s'",
5265
                       fmt);
5266
            goto out;
5267
        }
5268
    }
5269

    
5270
    if (base_fmt) {
5271
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5272
            error_setg(errp, "Backing file format not supported for file "
5273
                             "format '%s'", fmt);
5274
            goto out;
5275
        }
5276
    }
5277

    
5278
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5279
    if (backing_file && backing_file->value.s) {
5280
        if (!strcmp(filename, backing_file->value.s)) {
5281
            error_setg(errp, "Error: Trying to create an image with the "
5282
                             "same filename as the backing file");
5283
            goto out;
5284
        }
5285
    }
5286

    
5287
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5288
    if (backing_fmt && backing_fmt->value.s) {
5289
        backing_drv = bdrv_find_format(backing_fmt->value.s);
5290
        if (!backing_drv) {
5291
            error_setg(errp, "Unknown backing file format '%s'",
5292
                       backing_fmt->value.s);
5293
            goto out;
5294
        }
5295
    }
5296

    
5297
    // The size for the image must always be specified, with one exception:
5298
    // If we are using a backing file, we can obtain the size from there
5299
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
5300
    if (size && size->value.n == -1) {
5301
        if (backing_file && backing_file->value.s) {
5302
            BlockDriverState *bs;
5303
            uint64_t size;
5304
            char buf[32];
5305
            int back_flags;
5306

    
5307
            /* backing files always opened read-only */
5308
            back_flags =
5309
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5310

    
5311
            bs = NULL;
5312
            ret = bdrv_open(&bs, backing_file->value.s, NULL, back_flags,
5313
                            backing_drv, &local_err);
5314
            if (ret < 0) {
5315
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
5316
                                 backing_file->value.s,
5317
                                 error_get_pretty(local_err));
5318
                error_free(local_err);
5319
                local_err = NULL;
5320
                goto out;
5321
            }
5322
            bdrv_get_geometry(bs, &size);
5323
            size *= 512;
5324

    
5325
            snprintf(buf, sizeof(buf), "%" PRId64, size);
5326
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5327

    
5328
            bdrv_unref(bs);
5329
        } else {
5330
            error_setg(errp, "Image creation needs a size parameter");
5331
            goto out;
5332
        }
5333
    }
5334

    
5335
    if (!quiet) {
5336
        printf("Formatting '%s', fmt=%s ", filename, fmt);
5337
        print_option_parameters(param);
5338
        puts("");
5339
    }
5340
    ret = bdrv_create(drv, filename, param, &local_err);
5341
    if (ret == -EFBIG) {
5342
        /* This is generally a better message than whatever the driver would
5343
         * deliver (especially because of the cluster_size_hint), since that
5344
         * is most probably not much different from "image too large". */
5345
        const char *cluster_size_hint = "";
5346
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5347
            cluster_size_hint = " (try using a larger cluster size)";
5348
        }
5349
        error_setg(errp, "The image size is too large for file format '%s'"
5350
                   "%s", fmt, cluster_size_hint);
5351
        error_free(local_err);
5352
        local_err = NULL;
5353
    }
5354

    
5355
out:
5356
    free_option_parameters(create_options);
5357
    free_option_parameters(param);
5358

    
5359
    if (local_err) {
5360
        error_propagate(errp, local_err);
5361
    }
5362
}
5363

    
5364
/* Return the AioContext used by @bs.
 * Currently BlockDriverState always uses the main loop AioContext, so @bs
 * is not consulted. */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    AioContext *main_ctx = qemu_get_aio_context();

    return main_ctx;
}
5369

    
5370
void bdrv_add_before_write_notifier(BlockDriverState *bs,
5371
                                    NotifierWithReturn *notifier)
5372
{
5373
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5374
}
5375

    
5376
/* Forward @options to the driver's bdrv_amend_options hook.  Returns
 * -ENOTSUP when the driver has no such hook.  bs->drv is dereferenced
 * without a NULL check. */
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }

    return bs->drv->bdrv_amend_options(bs, options);
}
5383

    
5384
/* Used to recurse on single child block filters.
5385
 * Single child block filter will store their child in bs->file.
5386
 */
5387
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5388
                                      BlockDriverState *candidate)
5389
{
5390
    if (!bs->drv) {
5391
        return false;
5392
    }
5393

    
5394
    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5395
        if (bs == candidate) {
5396
            return true;
5397
        } else {
5398
            return false;
5399
        }
5400
    }
5401

    
5402
    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5403
        return false;
5404
    }
5405

    
5406
    if (!bs->file) {
5407
        return false;
5408
    }
5409

    
5410
    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5411
}
5412

    
5413
/* Use the driver's bdrv_recurse_is_first_non_filter hook when @bs has a
 * driver that provides one; otherwise fall back to
 * bdrv_generic_is_first_non_filter(). */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_recurse_is_first_non_filter) {
        return bdrv_generic_is_first_non_filter(bs, candidate);
    }

    return drv->bdrv_recurse_is_first_non_filter(bs, candidate);
}
5422

    
5423
/* This function checks if the candidate is the first non filter bs down it's
5424
 * bs chain. Since we don't have pointers to parents it explore all bs chains
5425
 * from the top. Some filters can choose not to pass down the recursion.
5426
 */
5427
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5428
{
5429
    BlockDriverState *bs;
5430

    
5431
    /* walk down the bs forest recursively */
5432
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5433
        bool perm;
5434

    
5435
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5436

    
5437
        /* candidate is the first non filter */
5438
        if (perm) {
5439
            return true;
5440
        }
5441
    }
5442

    
5443
    return false;
5444
}