/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
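
/* Example: throttling must be enabled before limits are set (illustrative
 * sketch; the 1 MB/s total-bandwidth figure is a hypothetical limit and the
 * ThrottleConfig/LeakyBucket layout is assumed from qemu/throttle.h):
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */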

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
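
/* Example: allocating an I/O buffer that honours the optimal alignment
 * (illustrative sketch; 'len' stands for a caller-chosen buffer size):
 *
 *     void *buf = qemu_memalign(bdrv_opt_mem_align(bs), len);
 *     ...
 *     qemu_vfree(buf);
 */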

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
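
/* Example results (illustrative; the image names are hypothetical):
 *
 *     path_combine(dest, sz, "/vm/base.qcow2", "overlay.qcow2")
 *         yields "/vm/overlay.qcow2"
 *     path_combine(dest, sz, "/vm/base.qcow2", "/abs/overlay.qcow2")
 *         yields "/abs/overlay.qcow2"
 */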

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
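
/* Example (illustrative sketch):
 *
 *     char tmp[PATH_MAX];
 *     int ret = get_tmp_filename(tmp, sizeof(tmp));
 *     if (ret < 0) {
 *         ... report -EOVERFLOW or the errno from mkstemp()/close() ...
 *     }
 */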

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
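
/* Example (illustrative): with protocol prefixes allowed, a filename such as
 * "nbd://localhost/disk" resolves to the driver whose protocol_name is "nbd",
 * while a plain path like "/var/lib/img.raw" falls back to the "file" driver:
 *
 *     BlockDriver *drv = bdrv_find_protocol("nbd://localhost/disk", true);
 */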

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
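
/* Example: mapping the "none" cache mode onto open flags (illustrative):
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("none", &flags) < 0) {
 *         ... invalid cache mode ...
 *     }
 *
 * afterwards flags contains BDRV_O_NOCACHE | BDRV_O_CACHE_WB.
 */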

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   const char *reference, QDict *options, int flags,
                   Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (reference) {
        if (filename || qdict_size(options)) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }
        QDECREF(options);

        bs = bdrv_find(reference);
        if (!bs) {
            error_setg(errp, "Cannot find block device '%s'", reference);
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
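
/* Example: keeping the options QDict usable after the call (illustrative
 * sketch; "test.raw" is a hypothetical image). The function consumes one
 * reference even on failure, so take an extra one first:
 *
 *     QDict *opts = qdict_new();
 *     qdict_put(opts, "filename", qstring_from_str("test.raw"));
 *     QINCREF(opts);
 *     ret = bdrv_file_open(&bs, NULL, NULL, opts, 0, &err);
 *     ...
 *     QDECREF(opts);
 */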

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files are always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        BlockDriverState *bs;

        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        bs = bdrv_new("");
        ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
        if (ret < 0) {
            bdrv_unref(bs);
        } else {
            *pbs = bs;
        }
    } else {
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
                             errp);
    }

done:
    qdict_del(options, bdref_key);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        bs1 = bdrv_new("");
        QINCREF(options);
        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
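
/* Example: atomically reopening two devices read-only (illustrative sketch;
 * bs1 and bs2 stand for two existing BlockDriverStates):
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs1, bs1->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs2, bs2->open_flags & ~BDRV_O_RDWR);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 *
 * bdrv_reopen_multiple() frees the queue, so it must not be reused afterwards.
 */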


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
1649

    
1650

    
1651
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

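/* Check whether any BlockDriverState in the system has requests in flight */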
static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.  Also, NUL-terminate the device_name to prevent
 * a double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

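/*
 * Move the fields that must stay attached to the device (rather than
 * follow the image contents) from bs_src to bs_dest.  bdrv_swap() uses
 * this to undo the effect of the whole-struct copy on these fields.
 */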
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list   = bs_src->node_list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

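/*
 * Close and free a BlockDriverState.  The asserts document the caller's
 * obligations: no device or job may be attached, the BDS must not be in
 * use, and all references and dirty bitmaps must have been dropped.
 */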
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

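/*
 * Emit a QMP block I/O error event for @bdrv, reporting the action taken
 * (report/ignore/stop) and whether the failing operation was a read or a
 * write.
 */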
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

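/*
 * Tell the attached device model about a medium change and emit the
 * matching QEVENT_DEVICE_TRAY_MOVED events: tray open if it was closed
 * before the change, tray close again when a new medium is loaded.
 */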
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of
 * the check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

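/*
 * Mark a tracked request as serialising and widen its overlap window to
 * @align boundaries so that overlapping requests will wait for it.
 */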
static void mark_request_serialising(BdrvTrackedRequest *req, size_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                      - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

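/* Return true if the request's overlap window intersects the byte range
 * [offset, offset + bytes) */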
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

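/*
 * Wait until no other tracked request overlaps the serialising window of
 * @self.  The waiting_for links let a request proceed instead of
 * deadlocking when its counterpart is already (indirectly) waiting for it.
 */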
static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;

    if (!bs->serialising_in_flight) {
        return;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    break;
                }
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    bdrv_refresh_limits(new_top_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

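/* Check a byte-granularity request against the device size; growable
 * devices accept any offset, so only non-growable ones are range-checked. */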
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

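/* Coroutine entry point for the synchronous requests built by bdrv_rwv_co() */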
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov,
                                     rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov,
                                      rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
                       QEMUIOVector *qiov, bool is_write,
                       BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };
    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
{
    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

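/*
 * Byte-granularity read emulated on top of sector reads: unaligned head and
 * tail bytes go through a one-sector bounce buffer, the aligned middle is
 * read in place.  Returns count1 on success, negative errno on failure.
 */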
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count) {
        len = count;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) {
            return ret;
        }
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0) {
            return count1;
        }
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) {
            return ret;
        }
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) {
            return ret;
        }
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

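/*
 * Byte-granularity vectored write emulated with sector-sized
 * read-modify-write cycles for the unaligned head and tail, and a direct
 * vectored write for the aligned middle.  Returns qiov->size on success.
 */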
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = qiov->size;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count) {
        len = count;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) {
            return ret;
        }
        count -= len;
        if (count == 0) {
            return qiov->size;
        }
        sector_num++;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        QEMUIOVector qiov_inplace;

        qemu_iovec_init(&qiov_inplace, qiov->niov);
        qemu_iovec_concat(&qiov_inplace, qiov, len,
                          nb_sectors << BDRV_SECTOR_BITS);
        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
        qemu_iovec_destroy(&qiov_inplace);
        if (ret < 0) {
            return ret;
        }

        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) {
            return ret;
        }
    }
    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = count1,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = MAX(0, ROUND_UP(total_sectors - sector_num,
                                         align >> BDRV_SECTOR_BITS));
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        /* TODO Switch to byte granularity */
        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

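/*
 * Zero out [sector_num, sector_num + nb_sectors).  The efficient driver
 * write-zeroes operation is tried first; if the driver does not support it,
 * fall back to writing a bounce buffer filled with zeroes.  Requests are
 * split so that their bulk is aligned to bs->bl.write_zeroes_alignment and
 * capped at the max_write_zeroes limit.
 */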
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3117
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3118
{
3119
    BlockDriver *drv = bs->drv;
3120
    QEMUIOVector qiov;
3121
    struct iovec iov = {0};
3122
    int ret = 0;
3123

    
3124
    int max_write_zeroes = bs->bl.max_write_zeroes ?
3125
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3126

    
3127
    while (nb_sectors > 0 && !ret) {
3128
        int num = nb_sectors;
3129

    
3130
        /* Align request.  Block drivers can expect the "bulk" of the request
3131
         * to be aligned.
3132
         */
3133
        if (bs->bl.write_zeroes_alignment
3134
            && num > bs->bl.write_zeroes_alignment) {
3135
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3136
                /* Make a small request up to the first aligned sector.  */
3137
                num = bs->bl.write_zeroes_alignment;
3138
                num -= sector_num % bs->bl.write_zeroes_alignment;
3139
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3140
                /* Shorten the request to the last aligned sector.  num cannot
3141
                 * underflow because num > bs->bl.write_zeroes_alignment.
3142
                 */
3143
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3144
            }
3145
        }
3146

    
3147
        /* limit request size */
3148
        if (num > max_write_zeroes) {
3149
            num = max_write_zeroes;
3150
        }
3151

    
3152
        ret = -ENOTSUP;
3153
        /* First try the efficient write zeroes operation */
3154
        if (drv->bdrv_co_write_zeroes) {
3155
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3156
        }
3157

    
3158
        if (ret == -ENOTSUP) {
3159
            /* Fall back to bounce buffer if write zeroes is unsupported */
3160
            iov.iov_len = num * BDRV_SECTOR_SIZE;
3161
            if (iov.iov_base == NULL) {
3162
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3163
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3164
            }
3165
            qemu_iovec_init_external(&qiov, &iov, 1);
3166

    
3167
            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3168

    
3169
            /* Keep bounce buffer around if it is big enough for all
3170
             * all future requests.
3171
             */
3172
            if (num < max_write_zeroes) {
3173
                qemu_vfree(iov.iov_base);
3174
                iov.iov_base = NULL;
3175
            }
3176
        }
3177

    
3178
        sector_num += num;
3179
        nb_sectors -= num;
3180
    }
3181

    
3182
    qemu_vfree(iov.iov_base);
3183
    return ret;
3184
}
3185

    
3186
/*
3187
 * Forwards an already correctly aligned write request to the BlockDriver.
3188
 */
3189
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3190
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3191
    QEMUIOVector *qiov, int flags)
3192
{
3193
    BlockDriver *drv = bs->drv;
3194
    int ret;
3195

    
3196
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3197
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3198

    
3199
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3200
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    wait_serialising_requests(req);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        /* TODO Switch to byte granularity */
        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

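/*
 * Worked example of the RMW padding above, assuming request_alignment is
 * 4096: a write of 1024 bytes at offset 5120 first takes the head path
 * (5120 & 4095 == 1024), which reads back the aligned block at 4096 and
 * prepends its first 1024 bytes, turning the request into offset 4096,
 * bytes 2048.  The tail path then fires (6144 & 4095 == 2048), appends the
 * trailing 2048 bytes of the same block, and rounds bytes up to 4096.  The
 * driver finally sees a single fully aligned 4096-byte write at 4096.
 */
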
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

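/*
 * For example, zeroing out the first megabyte of an image from coroutine
 * context while allowing the driver to punch holes where it can:
 *
 *     ret = bdrv_co_write_zeroes(bs, 0, 2048, BDRV_REQ_MAY_UNMAP);
 *
 * Note that BDRV_REQ_MAY_UNMAP is silently dropped above unless the image
 * was opened with BDRV_O_UNMAP, so callers do not need to check that flag
 * themselves.
 */
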
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

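/*
 * The two sizes above are independent: for a freshly created, sparse
 * 10 GiB qcow2 image, bdrv_getlength() reports the full virtual size
 * (10 GiB), while bdrv_get_allocated_file_size() reports only the space
 * the host file actually occupies (initially just the image metadata),
 * growing as guest clusters are written.
 */
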
/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}

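/*
 * A typical device-model error path ties the two helpers above together,
 * e.g. for a failed write with werror=enospc configured:
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, false, ENOSPC);
 *     bdrv_error_action(bs, action, false, ENOSPC);
 *
 * With BLOCKDEV_ON_ERROR_ENOSPC this yields BDRV_ACTION_STOP, so the VM is
 * paused in RUN_STATE_IO_ERROR and can be resumed once space is available;
 * any other errno is reported to the guest instead.
 */
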
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

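/*
 * Callers typically gate I/O on the key state, along the lines of:
 *
 *     if (bdrv_key_required(bs)) {
 *         if (bdrv_set_key(bs, passphrase) < 0) {
 *             // wrong or unusable key, the device stays unusable
 *         }
 *     }
 *
 * Note that bdrv_set_key() recurses into the backing file first, so a
 * single call suffices for an encrypted image whose encrypted backing
 * file shares the same passphrase.
 */
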
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

/* Find a block backend by its device name */
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Find a node in the bs graph by its node name */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Put this QMP function here so it can access the static graph_bdrv_states. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        entry->next = list;
        list = entry;
    }

    return list;
}

BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp)
{
    BlockDriverState *bs = NULL;

    if ((!device && !node_name) || (device && node_name)) {
        error_setg(errp, "Use either device or node-name but not both");
        return NULL;
    }

    if (device) {
        bs = bdrv_find(device);

        if (!bs) {
            error_set(errp, QERR_DEVICE_NOT_FOUND, device);
            return NULL;
        }

        return bs;
    }

    bs = bdrv_find_node(node_name);

    if (!bs) {
        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
        return NULL;
    }

    return bs;
}

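/*
 * QMP handlers that accept both identifiers resolve them in one step, e.g.:
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs(has_device ? device : NULL,
 *                                           has_node_name ? node_name : NULL,
 *                                           &local_err);
 *
 * Exactly one of the two names must be given; passing both or neither
 * fails with an explanatory error rather than guessing.
 */
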
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, device_list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}

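/*
 * The return value packs several facts into one int64_t: BDRV_BLOCK_DATA
 * and BDRV_BLOCK_ZERO describe the contents, and if BDRV_BLOCK_OFFSET_VALID
 * is set the sector-aligned host offset is stored in the upper bits of the
 * same value.  For instance, the protocol fallback above returns
 *
 *     BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE)
 *
 * i.e. "data is present at exactly this offset in the file", with *pnum
 * covering the whole clamped request.
 */
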
/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

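/*
 * Example: with the chain base <- snap <- top, asking whether a sector
 * that was only ever written in 'snap' is allocated above 'base':
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector, 1, &pnum);
 *
 * 'top' reports the sector unallocated, so the walk descends to 'snap',
 * which reports it allocated, and the call returns 1.  Passing base=NULL
 * instead would continue the walk down to the end of the chain.
 */
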
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

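/*
 * For example, two sorted requests {sector 0, 8 sectors} and {sector 4,
 * 8 sectors} overlap (4 <= 0 + 8), so they are merged: the first 4 sectors
 * (4 << 9 bytes) of the first qiov are kept, the whole second qiov is
 * appended, and the merged request becomes {sector 0, 12 sectors}.  A
 * request starting at sector 9 would leave a one-sector gap and stay
 * separate.
 */
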
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

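/*
 * All three emulated AIO entry points above follow the same pattern: wrap
 * the request in a BlockDriverAIOCBCoroutine, run the coroutine-based
 * implementation, and defer the completion callback to a bottom half so
 * that it never runs before the submitting function has returned its
 * BlockDriverAIOCB to the caller.
 */
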
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

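/*
 * Example of the alignment loop above: with bl.discard_alignment == 8 and
 * a request for sectors [5, 25), the first iteration clamps num to
 * 8 - (5 % 8) = 3 so that the request becomes aligned; the next iteration
 * then starts at sector 8 and can cover the rest of the range in one
 * piece, subject to the max_discard cap (32768 sectors by default).
 */
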
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4935
{
4936
    Coroutine *co;
4937
    RwCo rwco = {
4938
        .bs = bs,
4939
        .sector_num = sector_num,
4940
        .nb_sectors = nb_sectors,
4941
        .ret = NOT_DONE,
4942
    };
4943

    
4944
    if (qemu_in_coroutine()) {
4945
        /* Fast-path if already in coroutine context */
4946
        bdrv_discard_co_entry(&rwco);
4947
    } else {
4948
        co = qemu_coroutine_create(bdrv_discard_co_entry);
4949
        qemu_coroutine_enter(co, &rwco);
4950
        while (rwco.ret == NOT_DONE) {
4951
            qemu_aio_wait();
4952
        }
4953
    }
4954

    
4955
    return rwco.ret;
4956
}
4957

    
4958
/**************************************************************/
4959
/* removable device support */
4960

    
4961
/**
4962
 * Return TRUE if the media is present
4963
 */
4964
int bdrv_is_inserted(BlockDriverState *bs)
4965
{
4966
    BlockDriver *drv = bs->drv;
4967

    
4968
    if (!drv)
4969
        return 0;
4970
    if (!drv->bdrv_is_inserted)
4971
        return 1;
4972
    return drv->bdrv_is_inserted(bs);
4973
}
4974

    
4975
/**
4976
 * Return whether the media changed since the last call to this
4977
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4978
 */
4979
int bdrv_media_changed(BlockDriverState *bs)
4980
{
4981
    BlockDriver *drv = bs->drv;
4982

    
4983
    if (drv && drv->bdrv_media_changed) {
4984
        return drv->bdrv_media_changed(bs);
4985
    }
4986
    return -ENOTSUP;
4987
}
4988

    
4989
/**
4990
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4991
 */
4992
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4993
{
4994
    BlockDriver *drv = bs->drv;
4995

    
4996
    if (drv && drv->bdrv_eject) {
4997
        drv->bdrv_eject(bs, eject_flag);
4998
    }
4999

    
5000
    if (bs->device_name[0] != '\0') {
5001
        bdrv_emit_qmp_eject_event(bs, eject_flag);
5002
    }
5003
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
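
/*
 * Illustrative sketch (not part of the original file): bounce buffers for
 * O_DIRECT-style I/O must honour the backend's memory alignment, so pair
 * qemu_blockalign() with qemu_vfree().  Assumes a valid bs.
 *
 *     void *buf = qemu_blockalign(bs, 65536);   // 64 KiB, suitably aligned
 *     // ... fill buf and submit the request ...
 *     qemu_vfree(buf);
 */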

/*
 * Check if all memory in this vector is aligned to the backend's required
 * memory alignment, i.e. bdrv_opt_mem_align().
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    /* The granularity is given in bytes and must be a power of two */
    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
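
/*
 * Worked example (illustrative, not part of the original file): with
 * granularity = 65536 bytes and BDRV_SECTOR_BITS = 9, the shift yields
 * 65536 >> 9 = 128 sectors per bit, so the HBitmap is allocated with
 * ffs(128) - 1 = 7 granularity bits (1 << 7 == 128).
 *
 *     BdrvDirtyBitmap *bm = bdrv_create_dirty_bitmap(bs, 65536);
 */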

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
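
/*
 * Illustrative sketch (not part of the original file): walking the dirty
 * sectors of a bitmap with an HBitmapIter, as the mirror job does.
 * hbitmap_iter_next() returns the next set sector number, or -1 once the
 * bitmap is exhausted.
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         // copy out the granularity-sized chunk starting at 'sector'
 *     }
 */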

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If the reference count drops to zero, the BlockDriverState is deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
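
/*
 * Illustrative sketch (not part of the original file): the usual ownership
 * pattern.  Whoever stores a BlockDriverState pointer takes a reference
 * and drops it when done; the last bdrv_unref() deletes the BDS.
 *
 *     bdrv_ref(bs);       // keep bs alive across the operation
 *     do_something(bs);   // hypothetical caller code
 *     bdrv_unref(bs);     // may delete bs if this was the last reference
 */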

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
                     int64_t bytes, enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
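
/*
 * Illustrative sketch (not part of the original file): a device model
 * brackets each request with an accounting cookie; BDRV_ACCT_READ,
 * BDRV_ACCT_WRITE and BDRV_ACCT_FLUSH are the BlockAcctType values.
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     // ... issue and complete the read ...
 *     bdrv_acct_done(bs, &cookie);   // updates byte, op and latency counters
 */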

void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                bdrv_unref(bs);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRIu64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
}
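
/*
 * Illustrative sketch (not part of the original file): creating a qcow2
 * overlay on top of an existing raw image.  Passing (uint64_t)-1 as
 * img_size leaves the size unspecified so it is taken from the backing
 * file, per the logic above; the filenames are hypothetical.
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.img", "raw",
 *                     NULL, (uint64_t)-1, 0, &err, false);
 *     if (err) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 */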

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

/* Used to recurse on single child block filters.
 * Single child block filters store their child in bs->file.
 */
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (!bs->drv) {
        return false;
    }

    /* A non-filter BDS terminates the recursion: it is the first
     * non-filter if and only if it is the candidate itself. */
    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
        return bs == candidate;
    }

    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
        return false;
    }

    if (!bs->file) {
        return false;
    }

    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}

bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    return bdrv_generic_is_first_non_filter(bs, candidate);
}

/* This function checks if the candidate is the first non-filter BDS down its
 * chain.  Since we don't have pointers to parents, it explores all chains
 * from the top.  Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        if (!bs->file) {
            continue;
        }

        perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
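
/*
 * Illustrative example (not part of the original file): given the chain
 *
 *     guest device -> [filter BDS] -> bs->file -> [format BDS "disk"]
 *
 * bdrv_is_first_non_filter(disk) returns true, because the recursion
 * passes through the filter (if it authorizes BS_FILTER_PASS_DOWN) and
 * reaches "disk" as the first non-filter, while any BDS further down the
 * chain, such as a protocol BDS below "disk", would return false.
 */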