/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
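
/*
 * Illustrative results (derived from the checks above): bare drive letters
 * and device namespace paths are drives, paths within a drive are not:
 *
 *   is_windows_drive("c:")                    -> 1
 *   is_windows_drive("\\\\.\\PhysicalDrive0") -> 1
 *   is_windows_drive("c:\\test.img")          -> 0
 */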

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
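
/*
 * Sketch of the intended call order (the ThrottleConfig contents here are
 * placeholders; see the throttle API for the real field layout):
 *
 *   ThrottleConfig cfg;
 *   memset(&cfg, 0, sizeof(cfg));
 *   ... fill in the desired bps/iops limits ...
 *   bdrv_io_limits_enable(bs);    (must come first, initializes the state)
 *   bdrv_set_io_limits(bs, &cfg);
 */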

/* This function makes an I/O request wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* must this I/O wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
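
/*
 * For example:
 *
 *   path_has_protocol("nbd:localhost:10809") -> 1
 *   path_has_protocol("/images/test.qcow2")  -> 0  ('/' comes before any ':')
 *   path_has_protocol("test.qcow2")          -> 0
 */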

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
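
/*
 * Example (illustrative): a relative name is resolved against the directory
 * of base_path, while an absolute name is copied verbatim:
 *
 *   char dest[PATH_MAX];
 *   path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.qcow2");
 *       -> dest == "/images/backing.qcow2"
 *   path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs/other.raw");
 *       -> dest == "/abs/other.raw"
 */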

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
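
/*
 * Illustrative caller (mirrors the pattern used for the temporary snapshot
 * overlay in bdrv_open() below; the filename and size are hypothetical):
 *
 *   BlockDriver *drv = bdrv_find_format("qcow2");
 *   QEMUOptionParameter *opts =
 *       parse_option_parameters("", drv->create_options, NULL);
 *   set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024);
 *   Error *err = NULL;
 *   int ret = bdrv_create(drv, "test.qcow2", opts, &err);
 *   free_option_parameters(opts);
 */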

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
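
/*
 * Typical use, as in the snapshot code path of bdrv_open() below:
 *
 *   char tmp_filename[PATH_MAX + 1];
 *   int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *   if (ret < 0) {
 *       ... report -ret as an errno value ...
 *   }
 */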

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
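
/*
 * Examples of the resolution order above (which drivers are present depends
 * on the build configuration):
 *
 *   bdrv_find_protocol("/dev/cdrom", true)          -> a host device driver,
 *                                                      found by probing
 *   bdrv_find_protocol("/images/a.qcow2", true)     -> the "file" driver
 *   bdrv_find_protocol("nbd:localhost:10809", true) -> the driver whose
 *                                                      protocol_name is "nbd"
 */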

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
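
/*
 * The resulting flag combinations, for reference:
 *
 *   "writethrough" -> (none; this is the default)
 *   "writeback"    -> BDRV_O_CACHE_WB
 *   "none"/"off"   -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   "directsync"   -> BDRV_O_NOCACHE
 *   "unsafe"       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 */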

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* If bdrv_open() was called directly with a protocol driver as drv, this
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
static int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                          QDict *options, int flags, Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        ret = bdrv_open(pbs, filename, NULL, image_options, flags, NULL, errp);
    } else {
        ret = bdrv_open(pbs, filename, reference, image_options,
                        flags | BDRV_O_PROTOCOL, NULL, errp);
    }

done:
    qdict_del(options, bdref_key);
    return ret;
}
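
/*
 * Example of the flattened-QDict convention (the option values here are
 * hypothetical): with bdref_key = "file", an options QDict containing
 *
 *   "file.driver"   = "file"
 *   "file.filename" = "/images/test.img"
 *
 * yields an image_options QDict of { "driver": "file",
 * "filename": "/images/test.img" } for the new block device.
 */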

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        return bdrv_file_open(pbs, filename, options, flags & ~BDRV_O_PROTOCOL,
                              errp);
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        QINCREF(options);
        bs1 = NULL;
        ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
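
/*
 * Sketch of a multi-device reopen (bs1/bs2 and the flag values are
 * hypothetical):
 *
 *   BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs1, flags1);
 *   queue = bdrv_reopen_queue(queue, bs2, flags2);
 *   ret = bdrv_reopen_multiple(queue, &local_err);
 *
 * bdrv_reopen_multiple() frees the queue and either commits every entry or
 * aborts all prepared entries; see bdrv_reopen() below for the single-device
 * wrapper.
 */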

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1647

    
1648
/*
1649
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1650
 * makes them final by swapping the staging BlockDriverState contents into
1651
 * the active BlockDriverState contents.
1652
 */
1653
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1654
{
1655
    BlockDriver *drv;
1656

    
1657
    assert(reopen_state != NULL);
1658
    drv = reopen_state->bs->drv;
1659
    assert(drv != NULL);
1660

    
1661
    /* If there are any driver level actions to take */
1662
    if (drv->bdrv_reopen_commit) {
1663
        drv->bdrv_reopen_commit(reopen_state);
1664
    }
1665

    
1666
    /* set BDS specific flags now */
1667
    reopen_state->bs->open_flags         = reopen_state->flags;
1668
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1669
                                              BDRV_O_CACHE_WB);
1670
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1671

    
1672
    bdrv_refresh_limits(reopen_state->bs);
1673
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}
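
/*
 * Note on the loop above: aio_poll(ctx, blocking) returns true if progress
 * was made.  While requests are pending we poll with blocking=true and keep
 * iterating; once bdrv_requests_pending_all() reports idle we still do one
 * non-blocking poll so that any completion bottom halves scheduled by the
 * final request get to run before we return.
 */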

/* make a BlockDriverState anonymous by removing from bdrv_state and
 * graph_bdrv_state list.
 * Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list   = bs_src->node_list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048
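
/* COMMIT_BUF_SECTORS is the copy granularity used by bdrv_commit() below:
 * 2048 sectors x 512 bytes (BDRV_SECTOR_SIZE) = 1 MiB per iteration. */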

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
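
/* Worked example for mark_request_serialising(): with req->offset = 1536,
 * req->bytes = 1024 and align = 4096, overlap_offset = 1536 & ~4095 = 0 and
 * overlap_bytes = ROUND_UP(2560, 4096) - 0 = 4096, i.e. the request
 * serialises against anything touching its whole 4 KiB block. */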

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
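
/* Worked example for bdrv_round_to_clusters(): with a 64 KiB cluster size
 * (c = 128 sectors), a request at sector_num = 120 for nb_sectors = 20
 * straddles the first cluster boundary, so it rounds to
 * cluster_sector_num = 0 and cluster_nb_sectors = QEMU_ALIGN_UP(140, 128)
 * = 256 -- both clusters the request touches, in whole-cluster units. */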

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
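
/* Hypothetical usage sketch: rebasing the current image onto a new backing
 * file could look like
 *
 *     ret = bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");
 *
 * where the file name and format are illustrative values only; on success
 * bs->backing_file and bs->backing_format are updated to match. */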

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    bdrv_refresh_limits(new_top_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
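
/* Note on the synchronous wait above: rwco.ret starts as NOT_DONE, a
 * sentinel no request ever returns; when called outside coroutine context
 * the event loop is pumped with qemu_aio_wait() until bdrv_rw_co_entry()
 * stores the real return code, which also lets nested completions run in
 * the meantime. */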

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
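
/* Usage note for bdrv_make_zero(): passing BDRV_REQ_MAY_UNMAP (mentioned in
 * the comment above) lets drivers that support it discard the zeroed ranges
 * instead of allocating them:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */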

int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
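
/* Rationale for the conditional flush above: when enable_write_cache is
 * false (writethrough), bdrv_aligned_pwritev() already issues
 * bdrv_co_flush() after every write, so only the writeback case needs an
 * explicit flush here to get barrier semantics. */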

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating a cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
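
/* Worked example for the copy-on-read path above: with 64 KiB clusters, a
 * guest read of sectors 130-139 is widened to the whole cluster 128-255;
 * after the bounce-buffer read and write back, skip_bytes =
 * (130 - 128) * 512 = 1024 selects the guest's 10 sectors out of the
 * bounce buffer. */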

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
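
/* Worked example for the qiov padding above: with align = 4096, a guest
 * read of 2048 bytes at offset 5120 gets a 1024-byte head pad
 * (5120 & 4095) and a 1024-byte tail pad, turning it into one aligned
 * 4096-byte read at offset 4096; the guest buffer receives the middle
 * 2048 bytes. */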

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}
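
/* Worked example for the alignment logic above: with
 * write_zeroes_alignment = 128 and a request for sectors [100, 1100), the
 * first iteration issues an unaligned head of 28 sectors (100-127), the
 * second a fully aligned run of 896 sectors (128-1023), and the third the
 * unaligned tail of 76 sectors (1024-1099). */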

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3210

    
3211
/*
3212
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
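
/*
 * Worked example for the RMW alignment above (hypothetical numbers): with
 * request_alignment 4096, a 512-byte write at offset 5120 has both an
 * unaligned head (5120 % 4096 == 1024) and an unaligned tail.  The head
 * pass reads the 4096-byte block at offset 4096 and prepends its first
 * 1024 bytes to the qiov; the tail pass reads the same block and appends
 * the bytes covering offsets 5632 up to 8192.  The request handed to
 * bdrv_aligned_pwritev is then offset 4096, bytes 4096.
 */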

static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}
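
/*
 * Minimal usage sketch for the coroutine write path (hypothetical caller,
 * not part of this file): wrap a flat buffer in a QEMUIOVector and write
 * one sector from coroutine context.
 *
 *     static int coroutine_fn example_write_sector(BlockDriverState *bs,
 *                                                  uint8_t *buf)
 *     {
 *         QEMUIOVector qiov;
 *         struct iovec iov = {
 *             .iov_base = buf,
 *             .iov_len  = BDRV_SECTOR_SIZE,
 *         };
 *
 *         qemu_iovec_init_external(&qiov, &iov, 1);
 *         return bdrv_co_writev(bs, 0, 1, &qiov);
 *     }
 */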

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 on error or if unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
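
/*
 * Example (illustrative): with on_write_error set to
 * BLOCKDEV_ON_ERROR_ENOSPC, a write failing with ENOSPC maps to
 * BDRV_ACTION_STOP (pause the VM until space is freed), while any other
 * errno, e.g. EIO, maps to BDRV_ACTION_REPORT (forward the error to the
 * guest).
 */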

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

/* Look up a BlockDriverState by its device name */
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Look up a node in the graph of named BDSes by its node name */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Put this QMP function here so it can access the static graph_bdrv_states. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        entry->next = list;
        list = entry;
    }

    return list;
}

BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp)
{
    BlockDriverState *bs = NULL;

    if (device) {
        bs = bdrv_find(device);

        if (bs) {
            return bs;
        }
    }

    if (node_name) {
        bs = bdrv_find_node(node_name);

        if (bs) {
            return bs;
        }
    }

    error_setg(errp, "Cannot find device=%s nor node_name=%s",
                     device ? device : "",
                     node_name ? node_name : "");
    return NULL;
}
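
/*
 * Usage sketch (hypothetical caller, not part of this file): resolve a QMP
 * argument that may name either a device or a node, preferring the device
 * name; errp is assumed to be the caller's Error out-parameter.
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs(device, node_name, &local_err);
 *     if (!bs) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */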

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, device_list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}
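
/*
 * Example of interpreting the returned bit field (illustrative): for a
 * sector stored at byte offset 0x10000 of the underlying file, a format
 * driver may return
 *
 *     BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | 0x10000
 *
 * The low bits carry the flags; shifting the value right by
 * BDRV_SECTOR_BITS, as done for the BDRV_BLOCK_RAW case above, recovers
 * the host sector number.  A sector that reports neither DATA nor ZERO is
 * unallocated in this layer and defers to the backing file.
 */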

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive of TOP; BASE itself is not examined).  BASE can
 * be NULL to check if the given sector is allocated in any image of the
 * chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
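
/*
 * Example (illustrative): for a chain base <- mid <- top where a sector
 * was written only in mid, bdrv_is_allocated_above(top, base, ...) returns
 * 1 because the walk finds the sector allocated in mid, while
 * bdrv_is_allocated_above(top, mid, ...) returns 0 since only top itself
 * is inspected before the walk stops at mid.
 */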

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
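
/*
 * Minimal AIO usage sketch (hypothetical caller, not part of this file):
 * submit an asynchronous read and consume the result in the completion
 * callback, which runs with ret < 0 on error.
 *
 *     static void example_read_done(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             fprintf(stderr, "read failed: %s\n", strerror(-ret));
 *         }
 *     }
 *
 *     bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
 *                    example_read_done, NULL);
 */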

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
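
/*
 * Worked example for the merge above (hypothetical numbers): request A
 * covers sectors [0, 8) and request B covers sectors [4, 12).  Since
 * B.sector (4) <= A's end (8), they are merged: the first (4 - 0) << 9
 * bytes of A's qiov are kept, all of B's qiov is appended, and the merged
 * request covers sectors [0, 12) with nb_sectors recomputed from the
 * combined qiov size.
 */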

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
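
/*
 * Worked example for the splitting above (hypothetical limits): with
 * bl.discard_alignment = 8 and bl.max_discard = 16, discarding 40 sectors
 * starting at sector 5 issues chunks of 3 sectors (up to the alignment
 * boundary at sector 8), then 16, 16 and 5 sectors.
 */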

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
5034

    
5035
void *qemu_blockalign(BlockDriverState *bs, size_t size)
5036
{
5037
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
5038
}
5039

    
5040
/*
 * Check if all memory in this vector is aligned to the memory alignment
 * required by the underlying storage (bdrv_opt_mem_align()).
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

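/*
 * Buffers from qemu_blockalign() satisfy bdrv_qiov_is_aligned() by
 * construction.  A minimal sketch, assuming the example length is a
 * multiple of the required alignment:
 *
 *     void *buf = qemu_blockalign(bs, 65536);
 *     QEMUIOVector qiov;
 *
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buf, 65536);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     ... submit I/O with qiov ...
 *     qemu_iovec_destroy(&qiov);
 *     qemu_vfree(buf);
 */
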
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}

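/*
 * Worked example for the granularity math above: with a granularity of
 * 65536 bytes, 65536 >> BDRV_SECTOR_BITS is 128 sectors, and
 * ffs(128) - 1 is 7, so the HBitmap keeps one dirty bit per 2^7 = 128
 * sectors (64 KiB) of the image.
 */
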
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

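/*
 * Typical consumer loop over a dirty bitmap (a sketch; block-mirror style
 * users do roughly this):
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         ... copy out the dirty data starting at 'sector', then ...
 *         bdrv_reset_dirty(bs, sector, nr_sectors);
 *     }
 */
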
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If the reference count drops to zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

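/*
 * Usage sketch: each holder of a BlockDriverState pointer pairs bdrv_ref()
 * with a later bdrv_unref(); the last unref deletes bs:
 *
 *     bdrv_ref(bs);
 *     ... use bs ...
 *     bdrv_unref(bs);      may free bs; do not touch it afterwards
 */
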
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

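/*
 * Accounting sketch: device models bracket each request so that bytes,
 * operation count and latency are attributed to the right statistic
 * (BDRV_ACCT_READ here is just an example type):
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ... issue and complete the read ...
 *     bdrv_acct_done(bs, &cookie);
 */
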
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRIu64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (local_err) {
        error_propagate(errp, local_err);
    }
}

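/*
 * Usage sketch, roughly what "qemu-img create -f qcow2 -b base.img
 * overlay.qcow2" boils down to (file names are placeholders; a size of
 * (uint64_t)-1 means "take the size from the backing file"):
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.img", NULL, NULL,
 *                     (uint64_t)-1, 0, &err, false);
 *     if (err) {
 *         ... report error_get_pretty(err), then error_free(err) ...
 *     }
 */
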
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

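/*
 * Backs "qemu-img amend".  Only drivers that implement the
 * bdrv_amend_options hook (qcow2, at the time of writing) can change
 * options of an existing image; everything else reports -ENOTSUP.
 */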
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

/* Used to recurse on single-child block filters.
 * Single-child block filters store their child in bs->file.
 */
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (!bs->drv) {
        return false;
    }

    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
        if (bs == candidate) {
            return true;
        } else {
            return false;
        }
    }

    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
        return false;
    }

    if (!bs->file) {
        return false;
    }

    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}

bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    return bdrv_generic_is_first_non_filter(bs, candidate);
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents it explores all bs chains
 * from the top. Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non-filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
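
/*
 * Worked example: in a chain  filter -> format -> protocol  where the
 * filter passes recursion down through bs->file, the format node is the
 * first non-filter seen from the top, so bdrv_is_first_non_filter()
 * returns true for it and false for the protocol node below it.
 */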