Statistics
| Branch: | Revision:

root / block.c @ ddf5636d

History | View | Annotate | Download (152.1 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor/monitor.h"
28
#include "block/block_int.h"
29
#include "block/blockjob.h"
30
#include "qemu/module.h"
31
#include "qapi/qmp/qjson.h"
32
#include "sysemu/sysemu.h"
33
#include "qemu/notify.h"
34
#include "block/coroutine.h"
35
#include "block/qapi.h"
36
#include "qmp-commands.h"
37
#include "qemu/timer.h"
38

    
39
#ifdef CONFIG_BSD
40
#include <sys/types.h>
41
#include <sys/stat.h>
42
#include <sys/ioctl.h>
43
#include <sys/queue.h>
44
#ifndef __DragonFly__
45
#include <sys/disk.h>
46
#endif
47
#endif
48

    
49
#ifdef _WIN32
50
#include <windows.h>
51
#endif
52

    
53
/* Dirty-sector tracking state attached to a BlockDriverState; instances
 * live on the bs->dirty_bitmaps list (see bdrv_new()). */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;                    /* the actual dirty bit store */
    QLIST_ENTRY(BdrvDirtyBitmap) list;  /* entry in bs->dirty_bitmaps */
};
57

    
58
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59

    
60
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63
        BlockDriverCompletionFunc *cb, void *opaque);
64
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66
        BlockDriverCompletionFunc *cb, void *opaque);
67
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68
                                         int64_t sector_num, int nb_sectors,
69
                                         QEMUIOVector *iov);
70
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71
                                         int64_t sector_num, int nb_sectors,
72
                                         QEMUIOVector *iov);
73
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75
    BdrvRequestFlags flags);
76
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78
    BdrvRequestFlags flags);
79
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80
                                               int64_t sector_num,
81
                                               QEMUIOVector *qiov,
82
                                               int nb_sectors,
83
                                               BdrvRequestFlags flags,
84
                                               BlockDriverCompletionFunc *cb,
85
                                               void *opaque,
86
                                               bool is_write);
87
static void coroutine_fn bdrv_co_do_rw(void *opaque);
88
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90

    
91
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
93

    
94
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96

    
97
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
99

    
100
/* If non-zero, use only whitelisted block drivers */
101
static int use_bdrv_whitelist;
102

    
103
#ifdef _WIN32
104
/* Return non-zero if filename starts with a drive letter followed by a
 * colon, e.g. "c:" or "C:\foo". */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];
    int is_drive_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');

    return is_drive_letter && filename[1] == ':';
}
110

    
111
int is_windows_drive(const char *filename)
112
{
113
    if (is_windows_drive_prefix(filename) &&
114
        filename[2] == '\0')
115
        return 1;
116
    if (strstart(filename, "\\\\.\\", NULL) ||
117
        strstart(filename, "//./", NULL))
118
        return 1;
119
    return 0;
120
}
121
#endif
122

    
123
/* throttling disk I/O limits */
124
void bdrv_set_io_limits(BlockDriverState *bs,
125
                        ThrottleConfig *cfg)
126
{
127
    int i;
128

    
129
    throttle_config(&bs->throttle_state, cfg);
130

    
131
    for (i = 0; i < 2; i++) {
132
        qemu_co_enter_next(&bs->throttled_reqs[i]);
133
    }
134
}
135

    
136
/* this function drain all the throttled IOs */
137
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138
{
139
    bool drained = false;
140
    bool enabled = bs->io_limits_enabled;
141
    int i;
142

    
143
    bs->io_limits_enabled = false;
144

    
145
    for (i = 0; i < 2; i++) {
146
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147
            drained = true;
148
        }
149
    }
150

    
151
    bs->io_limits_enabled = enabled;
152

    
153
    return drained;
154
}
155

    
156
/* Disable I/O throttling on bs: restart every queued request, then tear
 * down the throttle state.  bdrv_io_limits_enable() must run before
 * throttling can be used again. */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}
164

    
165
/* Throttle timer callback: resume one request from the read queue
 * (throttled_reqs[0]; index 0 == !is_write). */
static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}
170

    
171
/* Throttle timer callback: resume one request from the write queue
 * (throttled_reqs[1]; index 1 == is_write). */
static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}
176

    
177
/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    /* enabling twice without an intervening disable is a programming error */
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
188

    
189
/* This function makes an IO wait if needed
 *
 * @bytes:      the number of bytes of the IO (the original comment said
 *              @nb_sectors, which no longer matches the parameter)
 * @is_write:   is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this io must wait */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if must wait or any request of this type throttled queue the IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
219

    
220
/* Return the optimal memory alignment for I/O buffers used with bs. */
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (bs && bs->drv) {
        return bs->bl.opt_mem_alignment;
    }

    /* 4k should be on the safe side */
    return 4096;
}
229

    
230
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *sep;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    sep = path + strcspn(path, ":/\\");
#else
    sep = path + strcspn(path, ":/");
#endif

    /* a ':' before any path separator marks a protocol prefix */
    return *sep == ':';
}
247

    
248
/* Return non-zero if path is absolute on this platform. */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return path[0] == '/' || path[0] == '\\';
#else
    return path[0] == '/';
#endif
}
260

    
261
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* p: first character after a "<protocol>:" prefix, if any */
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        /* p1: one past the last directory separator in base_path */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            /* on Windows '\\' also separates; keep whichever is last */
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        /* copy the directory part of base_path (truncated to fit),
         * then append the relative filename */
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
304

    
305
/* Build the full path of bs's backing file into dest (at most sz bytes).
 * An empty backing file name, or one carrying a protocol prefix, is
 * copied verbatim; otherwise it is combined with bs->filename. */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
313

    
314
/* Register a block driver with the block layer, filling in emulation
 * callbacks for the coroutine/AIO interfaces the driver lacks. */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
333

    
334
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    /* anonymous BDSes (empty device name) are not listed in bdrv_states */
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    /* start with a single reference */
    bs->refcnt = 1;

    return bs;
}
354

    
355
/* Register @notify to be invoked when bs is closed. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
359

    
360
BlockDriver *bdrv_find_format(const char *format_name)
361
{
362
    BlockDriver *drv1;
363
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
364
        if (!strcmp(drv1->format_name, format_name)) {
365
            return drv1;
366
        }
367
    }
368
    return NULL;
369
}
370

    
371
/* Return 1 if drv is allowed by the configured driver whitelists for
 * the requested access mode, 0 otherwise.  An empty whitelist allows
 * every driver. */
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **entry;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (entry = whitelist_rw; *entry; entry++) {
        if (strcmp(drv->format_name, *entry) == 0) {
            return 1;
        }
    }
    if (read_only) {
        for (entry = whitelist_ro; *entry; entry++) {
            if (strcmp(drv->format_name, *entry) == 0) {
                return 1;
            }
        }
    }
    return 0;
}
399

    
400
/* Like bdrv_find_format(), but additionally require the driver to be
 * whitelisted for the requested access mode. */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (!drv) {
        return NULL;
    }
    return bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
406

    
407
/* State shared between bdrv_create() and the coroutine that performs
 * the actual image creation. */
typedef struct CreateCo {
    BlockDriver *drv;               /* driver whose bdrv_create is invoked */
    char *filename;                 /* owned copy (g_strdup) of the name */
    QEMUOptionParameter *options;   /* creation options (not owned) */
    int ret;                        /* result; NOT_DONE while running */
    Error *err;                     /* error detail on failure, if any */
} CreateCo;
414

    
415
/* Coroutine entry point for bdrv_create(): run the driver's create
 * callback and publish result/error through the shared CreateCo. */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    /* storing the result signals completion to the bdrv_create() wait loop */
    cco->ret = ret;
}
429

    
430
/* Create an image with the given driver.  The driver callback runs in a
 * coroutine; if we are not already in coroutine context, pump the event
 * loop until it completes.  Returns 0 or a negative errno, setting
 * *errp on failure. */
int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        /* wait until the coroutine overwrites the NOT_DONE sentinel */
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    /* cco.filename was g_strdup'ed above and is owned here */
    g_free(cco.filename);
    return ret;
}
474

    
475
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
476
                     Error **errp)
477
{
478
    BlockDriver *drv;
479
    Error *local_err = NULL;
480
    int ret;
481

    
482
    drv = bdrv_find_protocol(filename, true);
483
    if (drv == NULL) {
484
        error_setg(errp, "Could not find protocol for file '%s'", filename);
485
        return -ENOENT;
486
    }
487

    
488
    ret = bdrv_create(drv, filename, options, &local_err);
489
    if (local_err) {
490
        error_propagate(errp, local_err);
491
    }
492
    return ret;
493
}
494

    
495
/* Recompute bs->bl: start from zero, inherit defaults from the file
 * child and backing file (recursively refreshed), then let the driver
 * override.  Returns 0, or the driver callback's negative errno. */
int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        /* the stricter (larger) of our own and the backing file's limits */
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}
531

    
532
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir = getenv("TMPDIR");
    int fd;

    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
566

    
567
/*
568
 * Detect host devices. By convention, /dev/cdrom[N] is always
569
 * recognized as a host CDROM.
570
 */
571
static BlockDriver *find_hdev_driver(const char *filename)
572
{
573
    int score_max = 0, score;
574
    BlockDriver *drv = NULL, *d;
575

    
576
    QLIST_FOREACH(d, &bdrv_drivers, list) {
577
        if (d->bdrv_probe_device) {
578
            score = d->bdrv_probe_device(filename);
579
            if (score > score_max) {
580
                score_max = score;
581
                drv = d;
582
            }
583
        }
584
    }
585

    
586
    return drv;
587
}
588

    
589
/* Select the protocol-level driver for @filename.  Host device probing
 * takes precedence (see XXX below); without a recognized "<protocol>:"
 * prefix, or when prefixes are disallowed, the "file" driver is used.
 * Returns NULL when an explicit protocol prefix matches no driver. */
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    /* isolate the "<protocol>" part (truncated to fit the buffer) */
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
630

    
631
/* Probe the image content to choose a format driver: read up to 2048
 * header bytes and let every driver with a bdrv_probe callback score
 * them; the highest scorer wins.  scsi-generic and empty drives get
 * "raw".  On return *pdrv is the chosen driver (or NULL) and the result
 * is 0 or a negative errno. */
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            /* ret here is the (non-negative) value bdrv_pread returned */
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
677

    
678
/**
 * Set the current 'total_sectors' value
 *
 * @hint: fallback sector count, used when the driver does not provide
 *        bdrv_getlength (or for scsi-generic devices).
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
701

    
702
/**
703
 * Set open flags for a given discard mode
704
 *
705
 * Return 0 on success, -1 if the discard mode was invalid.
706
 */
707
int bdrv_parse_discard_flags(const char *mode, int *flags)
708
{
709
    *flags &= ~BDRV_O_UNMAP;
710

    
711
    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
712
        /* do nothing */
713
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
714
        *flags |= BDRV_O_UNMAP;
715
    } else {
716
        return -1;
717
    }
718

    
719
    return 0;
720
}
721

    
722
/**
723
 * Set open flags for a given cache mode
724
 *
725
 * Return 0 on success, -1 if the cache mode was invalid.
726
 */
727
int bdrv_parse_cache_flags(const char *mode, int *flags)
728
{
729
    *flags &= ~BDRV_O_CACHE_MASK;
730

    
731
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
732
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
733
    } else if (!strcmp(mode, "directsync")) {
734
        *flags |= BDRV_O_NOCACHE;
735
    } else if (!strcmp(mode, "writeback")) {
736
        *flags |= BDRV_O_CACHE_WB;
737
    } else if (!strcmp(mode, "unsafe")) {
738
        *flags |= BDRV_O_CACHE_WB;
739
        *flags |= BDRV_O_NO_FLUSH;
740
    } else if (!strcmp(mode, "writethrough")) {
741
        /* this is the default */
742
    } else {
743
        return -1;
744
    }
745

    
746
    return 0;
747
}
748

    
749
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    /* take one copy-on-read reference */
    bs->copy_on_read++;
}
758

    
759
/* Drop one copy-on-read reference; see bdrv_enable_copy_on_read(). */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    /* unbalanced disable is a programming error */
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
764

    
765
/* Derive the flags passed to the driver's open callback from the
 * caller-supplied @flags. */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
784

    
785
/* Validate @node_name, attach it to bs and insert bs into the graph
 * node list.  A NULL node_name is accepted and does nothing.
 * Returns 0 on success, -EINVAL on an invalid or conflicting name. */
static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding namespaces collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return -EINVAL;
    }

    /* takes care of avoiding duplicates node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}
818

    
819
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        /* distinguish "read-only only" drivers from fully rejected ones */
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    /* temporary images are unlinked immediately; the open fd keeps them
     * alive until close */
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    /* undo the partial initialization above */
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
954

    
955
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 *
 * On success, *pbs receives a new (or looked-up and referenced) BDS and 0 is
 * returned; on failure a negative errno value is returned and errp is set.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   const char *reference, QDict *options, int flags,
                   Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* A reference names an existing BDS; it is mutually exclusive with a
     * filename or any options. */
    if (reference) {
        if (filename || qdict_size(options)) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }
        QDECREF(options);

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        /* Hand out an additional reference to the existing BDS */
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    /* bs->options takes over the caller's reference; work on a shallow clone
     * from here on so bs->options keeps the original, unconsumed set. */
    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        /* Only a caller-supplied filename may carry a "proto:" prefix */
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it; a driver parser may split the filename
     * into individual options, after which the raw string is dropped. */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    }

    /* Drivers without a .bdrv_file_open are format drivers; recurse through
     * bdrv_open(), which then owns (and may free) the options clone. */
    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    /* Protocol-level BDSes may grow on demand */
    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    /* QDECREF(NULL) is a no-op, so this is safe after the bdrv_open()
     * recursion cleared 'options'. */
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
1081

    
1082
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_file_open.
 *
 * Returns 0 on success (including the no-op cases), negative errno on failure.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    /* Already open: nothing to do, but the options reference is still
     * consumed per the contract above. */
    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        /* An explicit file.filename option overrides the recorded backing
         * file name; pass an empty (i.e. no) filename to bdrv_open(). */
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No backing file recorded and nothing requested: nothing to open */
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        /* Restore the "no backing file" state so a later retry is possible */
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    /* Record the protocol-level filename actually opened */
    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}
1149

    
1150
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is false and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    /* Pull all "${bdref_key}."-prefixed options out of 'options' into a
     * fresh QDict owned by the opened image. */
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        ret = bdrv_open(pbs, filename, NULL, image_options, flags, NULL, errp);
    } else {
        /* Either raw was forced or there is no filename: open at the
         * protocol level (or resolve the reference). */
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
                             errp);
    }

done:
    /* The BlockdevRef key is consumed even when nothing was opened */
    qdict_del(options, bdref_key);
    return ret;
}
1219

    
1220
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    /* Resolving a reference is mutually exclusive with reusing a BDS,
     * passing a filename, or passing options. */
    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* bs->options keeps the caller's reference; 'options' becomes a working
     * shallow clone that the open code below consumes key by key. */
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        QINCREF(options);
        bs1 = NULL;
        ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        /* NOTE(review): a bdrv_getlength() failure (negative value) is not
         * checked here and would yield a bogus size — confirm upstream. */
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        /* Wrap the user's options under "backing.*" so they apply to the
         * backing image of the temporary overlay. */
        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        /* From here on, open the temporary overlay instead of 'filename' */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            /* Probe the image content to detect the format */
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        /* find_image_format() failed; 'ret' and 'local_err' carry the cause */
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* Some drivers replace bs->file; drop our now-unneeded reference then */
    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    /* Encrypted images need a key before the medium can be considered
     * inserted; only signal media change when no key is required. */
    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        /* Remove the temporary snapshot overlay created above */
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
1470

    
1471
/* One element of a BlockReopenQueue: the staged reopen state for a single
 * BDS, plus a flag recording whether prepare() already succeeded (so that
 * only prepared entries are aborted on rollback). */
typedef struct BlockReopenQueueEntry {
     bool prepared;                                /* prepare() succeeded */
     BDRVReopenState state;                        /* staged reopen state */
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;  /* queue linkage */
} BlockReopenQueueEntry;
1476

    
1477
/*
1478
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1479
 * reopen of multiple devices.
1480
 *
1481
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1482
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1483
 * be created and initialized. This newly created BlockReopenQueue should be
1484
 * passed back in for subsequent calls that are intended to be of the same
1485
 * atomic 'set'.
1486
 *
1487
 * bs is the BlockDriverState to add to the reopen queue.
1488
 *
1489
 * flags contains the open flags for the associated bs
1490
 *
1491
 * returns a pointer to bs_queue, which is either the newly allocated
1492
 * bs_queue, or the existing bs_queue being used.
1493
 *
1494
 */
1495
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1496
                                    BlockDriverState *bs, int flags)
1497
{
1498
    assert(bs != NULL);
1499

    
1500
    BlockReopenQueueEntry *bs_entry;
1501
    if (bs_queue == NULL) {
1502
        bs_queue = g_new0(BlockReopenQueue, 1);
1503
        QSIMPLEQ_INIT(bs_queue);
1504
    }
1505

    
1506
    if (bs->file) {
1507
        bdrv_reopen_queue(bs_queue, bs->file, flags);
1508
    }
1509

    
1510
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
1511
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1512

    
1513
    bs_entry->state.bs = bs;
1514
    bs_entry->state.flags = flags;
1515

    
1516
    return bs_queue;
1517
}
1518

    
1519
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previous
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandonded, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * Returns 0 on success, -1 (with errp set) if any prepare step failed.
 * The queue and its entries are freed in all cases.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    /* No requests may be in flight while flags are switched */
    bdrv_drain_all();

    /* Phase 1: prepare every entry; stop at the first failure */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    /* On failure (ret != 0), roll back every entry that was prepared;
     * entries past the failing one were never prepared and need no abort. */
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
1571

    
1572

    
1573
/* Reopen a single BlockDriverState with the specified flags. */
1574
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1575
{
1576
    int ret = -1;
1577
    Error *local_err = NULL;
1578
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1579

    
1580
    ret = bdrv_reopen_multiple(queue, &local_err);
1581
    if (local_err != NULL) {
1582
        error_propagate(errp, local_err);
1583
    }
1584
    return ret;
1585
}
1586

    
1587

    
1588
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    /* Flush pending data before switching flags */
    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* Driver failed without setting an error; provide one */
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        /* NOTE(review): generic -1 rather than a -errno value — callers only
         * test for non-zero, but a proper errno might be preferable. */
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1659

    
1660
/*
1661
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1662
 * makes them final by swapping the staging BlockDriverState contents into
1663
 * the active BlockDriverState contents.
1664
 */
1665
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1666
{
1667
    BlockDriver *drv;
1668

    
1669
    assert(reopen_state != NULL);
1670
    drv = reopen_state->bs->drv;
1671
    assert(drv != NULL);
1672

    
1673
    /* If there are any driver level actions to take */
1674
    if (drv->bdrv_reopen_commit) {
1675
        drv->bdrv_reopen_commit(reopen_state);
1676
    }
1677

    
1678
    /* set BDS specific flags now */
1679
    reopen_state->bs->open_flags         = reopen_state->flags;
1680
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1681
                                              BDRV_O_CACHE_WB);
1682
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1683

    
1684
    bdrv_refresh_limits(reopen_state->bs);
1685
}
1686

    
1687
/*
1688
 * Abort the reopen, and delete and free the staged changes in
1689
 * reopen_state
1690
 */
1691
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1692
{
1693
    BlockDriver *drv;
1694

    
1695
    assert(reopen_state != NULL);
1696
    drv = reopen_state->bs->drv;
1697
    assert(drv != NULL);
1698

    
1699
    if (drv->bdrv_reopen_abort) {
1700
        drv->bdrv_reopen_abort(reopen_state);
1701
    }
1702
}
1703

    
1704

    
1705
/* Tear down a BlockDriverState: cancel its job, drain and flush all I/O,
 * notify close listeners, close the driver, drop the backing and protocol
 * layers, and reset the BDS fields to their "closed" state.
 * The BDS itself is not freed (see bdrv_unref() for that). */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        /* Release the backing chain before closing the driver */
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* On Win32 the temporary overlay cannot be unlinked while open
         * (done at open time on POSIX), so remove it here instead. */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        /* Reset per-image state so the BDS can be reused by a later open */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        /* Drop the protocol layer last */
        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1754

    
1755
void bdrv_close_all(void)
1756
{
1757
    BlockDriverState *bs;
1758

    
1759
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1760
        bdrv_close(bs);
1761
    }
1762
}
1763

    
1764
/* Check if any requests are in-flight (including throttled requests) */
1765
static bool bdrv_requests_pending(BlockDriverState *bs)
1766
{
1767
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
1768
        return true;
1769
    }
1770
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1771
        return true;
1772
    }
1773
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1774
        return true;
1775
    }
1776
    if (bs->file && bdrv_requests_pending(bs->file)) {
1777
        return true;
1778
    }
1779
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1780
        return true;
1781
    }
1782
    return false;
1783
}
1784

    
1785
static bool bdrv_requests_pending_all(void)
1786
{
1787
    BlockDriverState *bs;
1788
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1789
        if (bdrv_requests_pending(bs)) {
1790
            return true;
1791
        }
1792
    }
1793
    return false;
1794
}
1795

    
1796
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        /* Kick throttled requests so they can make progress */
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        /* Poll (blocking only while requests remain) until both the request
         * lists and the aio context report no further work. */
        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}
1823

    
1824
/* make a BlockDriverState anonymous by removing from bdrv_state and
1825
 * graph_bdrv_state list.
1826
   Also, NULL terminate the device_name to prevent double remove */
1827
void bdrv_make_anon(BlockDriverState *bs)
1828
{
1829
    if (bs->device_name[0] != '\0') {
1830
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1831
    }
1832
    bs->device_name[0] = '\0';
1833
    if (bs->node_name[0] != '\0') {
1834
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1835
    }
1836
    bs->node_name[0] = '\0';
1837
}
1838

    
1839
/* Give the driver a chance to update its internal back-pointers after the
 * BDS contents have been swapped; optional per-driver hook. */
static void bdrv_rebind(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv != NULL && drv->bdrv_rebind != NULL) {
        drv->bdrv_rebind(bs);
    }
}
1845

    
1846
/* Copy from bs_src to bs_dest the fields that must stay attached to the
 * guest device. bdrv_swap() calls this three times (via a temporary) to
 * move these fields back after swapping the whole structs. */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list   = bs_src->node_list;
}
1897

    
1898
/*
1899
 * Swap bs contents for two image chains while they are live,
1900
 * while keeping required fields on the BlockDriverState that is
1901
 * actually attached to a device.
1902
 *
1903
 * This will modify the BlockDriverState fields, and swap contents
1904
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1905
 *
1906
 * bs_new is required to be anonymous.
1907
 *
1908
 * This function does not create any image files.
1909
 */
1910
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Swap the entire struct contents; device-bound fields are moved
     * back into place below. */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Let the format drivers fix up their back pointers. */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
1945

    
1946
/*
1947
 * Add new bs contents at the top of an image chain while the chain is
1948
 * live, while keeping required fields on the top layer.
1949
 *
1950
 * This will modify the BlockDriverState fields, and swap contents
1951
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1952
 *
1953
 * bs_new is required to be anonymous.
1954
 *
1955
 * This function does not create any image files.
1956
 */
1957
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    const char *backing_fmt;

    /* After the swap, bs_top carries the new overlay state while bs_new
     * carries what used to be the top of the chain. */
    bdrv_swap(bs_new, bs_top);

    backing_fmt = bs_new->drv ? bs_new->drv->format_name : "";

    /* Wire the old top image in as the backing file of the new top. */
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    bs_top->backing_hd = bs_new;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            backing_fmt);
}
1970

    
1971
static void bdrv_delete(BlockDriverState *bs)
{
    /* Deleting a BDS that still has a device, a job, users or dirty
     * bitmaps attached is a programming error. */
    assert(bs->dev == NULL);
    assert(bs->job == NULL);
    assert(bs->in_use == 0);
    assert(bs->refcnt == 0);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* Drop the BDS from the global lists, if it was on them. */
    bdrv_make_anon(bs);

    g_free(bs);
}
1986

    
1987
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    /* Only one device may be attached at a time. */
    if (bs->dev != NULL) {
        return -EBUSY;
    }

    bs->dev = dev;
    bdrv_iostatus_reset(bs);

    return 0;
}
1997

    
1998
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1999
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    int ret = bdrv_attach_dev(bs, dev);

    /* Callers of the _nofail variant guarantee bs has no device yet. */
    if (ret < 0) {
        abort();
    }
}
2005

    
2006
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    /* Callers must pass the device that is actually attached. */
    assert(bs->dev == dev);

    /* Reset all device-related state. */
    bs->dev_opaque = NULL;
    bs->dev_ops = NULL;
    bs->dev = NULL;
    bs->guest_block_size = 512;  /* restore the default block size */
}
2015

    
2016
/* TODO change to return DeviceState * when all users are qdevified */
2017
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    /* Accessor for the device installed via bdrv_attach_dev(). */
    return bs->dev;
}
2021

    
2022
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    /* Install the device callback table and the opaque passed to it. */
    bs->dev_opaque = opaque;
    bs->dev_ops = ops;
}
2028

    
2029
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2030
                               enum MonitorEvent ev,
2031
                               BlockErrorAction action, bool is_read)
2032
{
2033
    QObject *data;
2034
    const char *action_str;
2035

    
2036
    switch (action) {
2037
    case BDRV_ACTION_REPORT:
2038
        action_str = "report";
2039
        break;
2040
    case BDRV_ACTION_IGNORE:
2041
        action_str = "ignore";
2042
        break;
2043
    case BDRV_ACTION_STOP:
2044
        action_str = "stop";
2045
        break;
2046
    default:
2047
        abort();
2048
    }
2049

    
2050
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2051
                              bdrv->device_name,
2052
                              action_str,
2053
                              is_read ? "read" : "write");
2054
    monitor_protocol_event(ev, data);
2055

    
2056
    qobject_decref(data);
2057
}
2058

    
2059
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    /* Build and emit the DEVICE_TRAY_MOVED monitor event. */
    QObject *data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                       bdrv_get_device_name(bs), ejected);

    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
    qobject_decref(data);
}
2069

    
2070
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool tray_was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    tray_was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);
    if (tray_was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
2085

    
2086
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    /* No attached device at all counts as removable. */
    if (!bs->dev) {
        return true;
    }
    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
2090

    
2091
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    const BlockDevOps *ops = bs->dev_ops;

    /* Forward the eject request to the device, if it handles it. */
    if (ops != NULL && ops->eject_request_cb != NULL) {
        ops->eject_request_cb(bs->dev_opaque, force);
    }
}
2097

    
2098
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    /* Without a callback the tray is reported as closed. */
    if (ops == NULL || ops->is_tray_open == NULL) {
        return false;
    }
    return ops->is_tray_open(bs->dev_opaque);
}
2105

    
2106
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    /* Notify the attached device, if it cares about resizes. */
    if (ops != NULL && ops->resize_cb != NULL) {
        ops->resize_cb(bs->dev_opaque);
    }
}
2112

    
2113
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    /* Without a callback the medium is considered unlocked. */
    if (ops == NULL || ops->is_medium_locked == NULL) {
        return false;
    }
    return ops->is_medium_locked(bs->dev_opaque);
}
2120

    
2121
/*
2122
 * Run consistency checks on an image
2123
 *
2124
 * Returns 0 if the check could be completed (it doesn't mean that the image is
2125
 * free of errors) or -errno when an internal error occurred. The results of the
2126
 * check are stored in res.
2127
 */
2128
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    /* Fix: guard against a NULL driver (no medium) before dereferencing
     * it, mirroring the -ENOMEDIUM convention used by bdrv_commit(). */
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    /* Results are accumulated into *res by the driver callback. */
    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
2137

    
2138
#define COMMIT_BUF_SECTORS 2048
2139

    
2140
/* commit COW file into the raw image */
2141
/* Commit the COW overlay 'bs' into its backing file.
 *
 * Returns 0 on success, -ENOMEDIUM if bs has no medium, -ENOTSUP if it has
 * no backing file, -EBUSY if either image is in use, -EACCES if a read-only
 * backing file cannot be reopened read-write, or another negative errno
 * from the copy itself.
 */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    /* Fix: brace the single-statement if body, per the brace style used
     * everywhere else in this file. */
    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    /* Temporarily reopen a read-only backing file read-write. */
    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    /* Copy down only the ranges allocated in the top image; unallocated
     * ranges are already served from the backing file. */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    /* Discard the now-redundant data from the top image, if supported. */
    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2242

    
2243
int bdrv_commit_all(void)
2244
{
2245
    BlockDriverState *bs;
2246

    
2247
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2248
        if (bs->drv && bs->backing_hd) {
2249
            int ret = bdrv_commit(bs);
2250
            if (ret < 0) {
2251
                return ret;
2252
            }
2253
        }
2254
    }
2255
    return 0;
2256
}
2257

    
2258
/**
2259
 * Remove an active request from the tracked requests list
2260
 *
2261
 * This function should be called when a tracked request is completing.
2262
 */
2263
static void tracked_request_end(BdrvTrackedRequest *req)
{
    /* A serialising request stops blocking others once it completes. */
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    /* Wake every request that was waiting for this one. */
    qemu_co_queue_restart_all(&req->wait_queue);
}
2272

    
2273
/**
2274
 * Add an active request to the tracked requests list
2275
 */
2276
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    /* The overlap range starts out equal to the request itself; it may
     * later be widened by mark_request_serialising(). */
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
2296

    
2297
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    /* Widen the overlap window to 'align' boundaries; the & ~(align - 1)
     * rounding assumes align is a power of two. */
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    /* Count each request as serialising at most once. */
    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    /* Only ever grow the overlap window, never shrink it. */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
2311

    
2312
/**
2313
 * Round a region to cluster boundaries
2314
 */
2315
void bdrv_round_to_clusters(BlockDriverState *bs,
2316
                            int64_t sector_num, int nb_sectors,
2317
                            int64_t *cluster_sector_num,
2318
                            int *cluster_nb_sectors)
2319
{
2320
    BlockDriverInfo bdi;
2321

    
2322
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2323
        *cluster_sector_num = sector_num;
2324
        *cluster_nb_sectors = nb_sectors;
2325
    } else {
2326
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2327
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2328
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2329
                                            nb_sectors, c);
2330
    }
2331
}
2332

    
2333
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    /* Fall back to the request alignment when the driver reports no
     * cluster size. */
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    }
    return bdi.cluster_size;
}
2345

    
2346
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2347
                                     int64_t offset, unsigned int bytes)
2348
{
2349
    /*        aaaa   bbbb */
2350
    if (offset >= req->overlap_offset + req->overlap_bytes) {
2351
        return false;
2352
    }
2353
    /* bbbb   aaaa        */
2354
    if (req->overlap_offset >= offset + bytes) {
2355
        return false;
2356
    }
2357
    return true;
2358
}
2359

    
2360
/* Block the calling coroutine until no conflicting serialising request
 * overlaps 'self'. Returns true if it had to yield at least once. */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: nothing serialising is in flight. */
    if (!bs->serialising_in_flight) {
        return false;
    }

    /* Rescan the list from the start after each wait, since the set of
     * tracked requests may have changed while we slept. */
    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Only wait if at least one of the two sides serialises. */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2403

    
2404
/*
2405
 * Return values:
2406
 * 0        - success
2407
 * -EINVAL  - backing format specified, but no file
2408
 * -ENOSPC  - can't update the backing file because no space is left in the
2409
 *            image file header
2410
 * -ENOTSUP - format driver doesn't support changing the backing file
2411
 */
2412
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    /* Updating the header is a per-format operation. */
    if (drv->bdrv_change_backing_file == NULL) {
        return -ENOTSUP;
    }
    ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);

    /* Mirror the new values into the in-memory state on success. */
    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
2435

    
2436
/*
2437
 * Finds the image layer in the chain that has 'bs' as its backing file.
2438
 *
2439
 * active is the current topmost image.
2440
 *
2441
 * Returns NULL if bs is not found in active's image chain,
2442
 * or if active == bs.
2443
 */
2444
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *walker;

    assert(active != NULL);
    assert(bs != NULL);

    /* By definition, active itself has no overlay. */
    if (active == bs) {
        return NULL;
    }

    /* Walk down the backing chain looking for the node directly above bs. */
    for (walker = active; walker->backing_hd != NULL;
         walker = walker->backing_hd) {
        if (walker->backing_hd == bs) {
            return walker;
        }
    }

    /* bs is not part of active's chain. */
    return NULL;
}
2470

    
2471
/* List node used by bdrv_drop_intermediate() to remember which
 * BlockDriverStates are queued for deletion. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
2475

    
2476

    
2477
/*
2478
 * Drops images above 'base' up to and including 'top', and sets the image
2479
 * above 'top' to have base as its backing file.
2480
 *
2481
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2482
 * information in 'bs' can be properly updated.
2483
 *
2484
 * E.g., this will convert the following chain:
2485
 * bottom <- base <- intermediate <- top <- active
2486
 *
2487
 * to
2488
 *
2489
 * bottom <- base <- active
2490
 *
2491
 * It is allowed for bottom==base, in which case it converts:
2492
 *
2493
 * base <- intermediate <- top <- active
2494
 *
2495
 * to
2496
 *
2497
 * base <- active
2498
 *
2499
 * Error conditions:
2500
 *  if active == top, that is considered an error
2501
 *
2502
 */
2503
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    /* Both endpoints need an opened driver. */
    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    /* Update the on-disk backing file reference first; only relink the
     * in-memory chain if that succeeded. */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    bdrv_refresh_limits(new_top_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* Free the bookkeeping nodes on both success and error paths. */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2578

    
2579

    
2580
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2581
                                   size_t size)
2582
{
2583
    int64_t len;
2584

    
2585
    if (!bdrv_is_inserted(bs))
2586
        return -ENOMEDIUM;
2587

    
2588
    if (bs->growable)
2589
        return 0;
2590

    
2591
    len = bdrv_getlength(bs);
2592

    
2593
    if (offset < 0)
2594
        return -EIO;
2595

    
2596
    if ((offset > len) || (len - offset < size))
2597
        return -EIO;
2598

    
2599
    return 0;
2600
}
2601

    
2602
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    /* Convert the sector-based request to bytes and reuse the byte check. */
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2608

    
2609
/* Argument bundle for bdrv_rw_co_entry(): carries a synchronous request
 * into the coroutine and the result back out. */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;                 /* NOT_DONE until the coroutine finishes */
    BdrvRequestFlags flags;
} RwCo;
2617

    
2618
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    /* Dispatch to the read or write path; the result lands in rwco->ret. */
    if (rwco->is_write) {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    }
}
2632

    
2633
/*
2634
 * Process a vectored synchronous request using coroutines
2635
 */
2636
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,    /* sentinel, overwritten by bdrv_rw_co_entry() */
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        /* Spawn the coroutine and service AIO until it completes. */
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
2673

    
2674
/*
2675
 * Process a synchronous request using coroutines
2676
 */
2677
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov;

    /* Wrap the flat buffer in a single-element I/O vector. */
    iov.iov_base = (void *)buf;
    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&qiov, &iov, 1);

    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}
2690

    
2691
/* return < 0 if error. See bdrv_write() for the return codes */
2692
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2693
              uint8_t *buf, int nb_sectors)
2694
{
2695
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2696
}
2697

    
2698
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2699
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2700
                          uint8_t *buf, int nb_sectors)
2701
{
2702
    bool enabled;
2703
    int ret;
2704

    
2705
    enabled = bs->io_limits_enabled;
2706
    bs->io_limits_enabled = false;
2707
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2708
    bs->io_limits_enabled = enabled;
2709
    return ret;
2710
}
2711

    
2712
/* Return < 0 if error. Important errors are:
2713
  -EIO         generic I/O error (may happen for all errors)
2714
  -ENOMEDIUM   No media inserted.
2715
  -EINVAL      Invalid sector number or nb_sectors
2716
  -EACCES      Trying to write a read-only device
2717
*/
2718
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2719
               const uint8_t *buf, int nb_sectors)
2720
{
2721
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2722
}
2723

    
2724
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    /* A write with a NULL buffer plus BDRV_REQ_ZERO_WRITE zeroes the range. */
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      flags | BDRV_REQ_ZERO_WRITE);
}
2730

    
2731
/*
2732
 * Completely zero out a block device with the help of bdrv_write_zeroes.
2733
 * The operation is sped up by checking the block status and only writing
2734
 * zeroes to the device if they currently do not return zeroes. Optional
2735
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2736
 *
2737
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2738
 */
2739
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t sector_num = 0;
    int n;

    while (sector_num < target_size) {
        int64_t nb_sectors = target_size - sector_num;
        int64_t ret;

        /* bdrv_get_block_status() takes an int-sized sector count. */
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }

        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }

        /* Only touch ranges that do not already read as zeroes. */
        if (!(ret & BDRV_BLOCK_ZERO)) {
            ret = bdrv_write_zeroes(bs, sector_num, n, flags);
            if (ret < 0) {
                error_report("error writing zeroes at sector %" PRId64 ": %s",
                             sector_num, strerror(-ret));
                return ret;
            }
        }
        sector_num += n;
    }
    return 0;
}
2772

    
2773
/* Synchronous byte-granularity read of 'bytes' bytes at 'offset' into
 * 'buf'.  Returns the number of bytes read on success, -EINVAL for a
 * negative byte count, or another -errno on I/O failure.
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    /* Wrap the flat buffer in a single-element iovec for the vectored
     * coroutine path. */
    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}
2794

    
2795
/* Synchronous vectored write of 'qiov' at byte 'offset'.
 * Returns the number of bytes written (qiov->size) on success,
 * -errno on failure.
 */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret = bdrv_prwv_co(bs, offset, qiov, true, 0);

    return ret < 0 ? ret : qiov->size;
}
2806

    
2807
/* Synchronous byte-granularity write of 'bytes' bytes from 'buf' at
 * 'offset'.  Returns the number of bytes written on success, -EINVAL for
 * a negative byte count, or another -errno on failure.
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    struct iovec iov;
    QEMUIOVector qiov;

    if (bytes < 0) {
        return -EINVAL;
    }

    /* Present the flat buffer as a one-element vector. */
    iov.iov_base = (void *)buf;
    iov.iov_len = bytes;
    qemu_iovec_init_external(&qiov, &iov, 1);

    return bdrv_pwritev(bs, offset, &qiov);
}
2823

    
2824
/*
2825
 * Writes to the file and ensures that no writes are reordered across this
2826
 * request (acts as a barrier)
2827
 *
2828
 * Returns 0 on success, -errno in error cases.
2829
 */
2830
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2831
    const void *buf, int count)
2832
{
2833
    int ret;
2834

    
2835
    ret = bdrv_pwrite(bs, offset, buf, count);
2836
    if (ret < 0) {
2837
        return ret;
2838
    }
2839

    
2840
    /* No flush needed for cache modes that already do it */
2841
    if (bs->enable_write_cache) {
2842
        bdrv_flush(bs);
2843
    }
2844

    
2845
    return 0;
2846
}
2847

    
2848
/* Copy-on-read implementation: read a whole cluster from the current
 * layer's driver, write it back into the image, and copy the requested
 * subrange to the caller's qiov.  Returns 0 on success, -errno on
 * failure of either the read or the write-back.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    /* Bounce buffer sized to the full (cluster-aligned) range. */
    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        /* All-zero cluster: use the driver's efficient zero-write path. */
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Copy only the sectors the caller actually asked for out of the
     * cluster-aligned bounce buffer. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
2913

    
2914
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 *
 * 'offset' and 'bytes' must be multiples of BDRV_SECTOR_SIZE (asserted
 * below); 'align' is the request alignment used when clamping reads near
 * EOF.  Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        /* Only fall into the CoR slow path when part of the range is not
         * yet allocated in this layer. */
        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        /* Clamp the driver read to the (aligned) device length ... */
        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}
2993

    
2994
/*
 * Handle a read request in coroutine context
 *
 * Applies copy-on-read and I/O throttling policy, pads the request to
 * 'align' with head/tail bounce buffers when needed, tracks the request,
 * and delegates to bdrv_aligned_preadv().  Returns 0 on success,
 * -ENOMEDIUM / -EIO / other -errno on failure.
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        /* Unaligned head: prepend a bounce buffer covering the gap down to
         * the previous alignment boundary. */
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        /* Unaligned tail: append a bounce buffer up to the next boundary. */
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3067

    
3068
/* Sector-based wrapper around bdrv_co_do_preadv(): validates the sector
 * count and converts to a byte-based request.
 *
 * The bound is INT_MAX >> BDRV_SECTOR_BITS, not UINT_MAX: nb_sectors is a
 * signed int, so left-shifting any larger value by BDRV_SECTOR_BITS would
 * overflow the int (undefined behaviour).  This also matches the check in
 * bdrv_co_do_writev().
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}
3079

    
3080
/* Public coroutine read entry point: trace and forward with no flags. */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}
3087

    
3088
/* Coroutine read entry point that forces copy-on-read for this request. */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}
3096

    
3097
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

/* Write zeroes over [sector_num, sector_num + nb_sectors), splitting the
 * range per the driver's alignment and size limits.  Tries the driver's
 * bdrv_co_write_zeroes first; on -ENOTSUP falls back to writing from a
 * zero-filled bounce buffer.  Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* Allocate lazily; reused across loop iterations. */
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * all future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}
3171

    
3172
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 *
 * Waits for overlapping serialising requests, runs the before-write
 * notifiers (which may veto the request), performs the write (or zero
 * write), flushes if write caching is off, and updates dirty bitmap,
 * wr_highest_sector and total_sectors bookkeeping.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* Writethrough semantics: flush after every successful write when the
     * write cache is disabled. */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        /* A successful write may have grown the device. */
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3222

    
3223
/*
 * Handle a write request in coroutine context
 *
 * Validates the request, applies throttling, and — when offset/bytes are
 * not aligned to 'align' — performs a read-modify-write cycle: the head
 * and/or tail alignment gaps are read into bounce buffers, merged into a
 * local qiov around the caller's data, and the widened request is passed
 * to bdrv_aligned_pwritev().  Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        /* Serialise so no other request mutates the head sector between our
         * read and the final write. */
        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append only the part of the tail sector past the caller's data. */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3344

    
3345
/* Sector-based wrapper around bdrv_co_do_pwritev(): validates the sector
 * count (bounded so the left shift cannot overflow the int) and converts
 * to a byte-based request.
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}
3356

    
3357
/* Public coroutine write entry point: trace and forward with no flags. */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}
3364

    
3365
/* Coroutine zero-write entry point.  BDRV_REQ_MAY_UNMAP is only honoured
 * when the device was opened with BDRV_O_UNMAP; otherwise the flag is
 * stripped before the request is issued.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}
3378

    
3379
/**
3380
 * Truncate file to 'offset' bytes (needed only for file protocols)
3381
 */
3382
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3383
{
3384
    BlockDriver *drv = bs->drv;
3385
    int ret;
3386
    if (!drv)
3387
        return -ENOMEDIUM;
3388
    if (!drv->bdrv_truncate)
3389
        return -ENOTSUP;
3390
    if (bs->read_only)
3391
        return -EACCES;
3392
    if (bdrv_in_use(bs))
3393
        return -EBUSY;
3394
    ret = drv->bdrv_truncate(bs, offset);
3395
    if (ret == 0) {
3396
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3397
        bdrv_dev_resize_cb(bs);
3398
    }
3399
    return ret;
3400
}
3401

    
3402
/**
 * Length of a allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    /* Format drivers without their own implementation delegate to the
     * underlying protocol file. */
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}
3420

    
3421
/**
3422
 * Length of a file in bytes. Return < 0 if error or unknown.
3423
 */
3424
int64_t bdrv_getlength(BlockDriverState *bs)
3425
{
3426
    BlockDriver *drv = bs->drv;
3427
    if (!drv)
3428
        return -ENOMEDIUM;
3429

    
3430
    if (drv->has_variable_length) {
3431
        int ret = refresh_total_sectors(bs, bs->total_sectors);
3432
        if (ret < 0) {
3433
            return ret;
3434
        }
3435
    }
3436
    return bs->total_sectors * BDRV_SECTOR_SIZE;
3437
}
3438

    
3439
/* return 0 as number of sectors if no device present or error */
3440
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3441
{
3442
    int64_t length;
3443
    length = bdrv_getlength(bs);
3444
    if (length < 0)
3445
        length = 0;
3446
    else
3447
        length = length >> BDRV_SECTOR_BITS;
3448
    *nb_sectors_ptr = length;
3449
}
3450

    
3451
/* Configure the policies applied when a read or a write fails. */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}
3457

    
3458
/* Return the configured error policy for the given direction. */
BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}
3462

    
3463
/* Map the configured error policy and the errno of a failed request to
 * the action (report / ignore / stop) the device model should take.
 * BLOCKDEV_ON_ERROR_ENOSPC stops only for ENOSPC and reports otherwise.
 */
BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}
3480

    
3481
/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 *
 * Emits the QMP BLOCK_IO_ERROR event and, for BDRV_ACTION_STOP, pauses
 * the VM and records the error in the device's iostatus.
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}
3495

    
3496
/* Return non-zero if the device is read-only. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}
3500

    
3501
/* Return non-zero if the device is a SCSI generic (sg) device. */
int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}
3505

    
3506
/* Return non-zero if the write cache (writeback mode) is enabled. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}
3510

    
3511
/* Enable or disable the write cache, keeping BDRV_O_CACHE_WB in
 * open_flags in sync with the new setting.
 */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    bs->open_flags &= ~BDRV_O_CACHE_WB;
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    }
}
3522

    
3523
/* Return non-zero if this image or its backing file is encrypted. */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
3529

    
3530
/* Return non-zero if this image or its backing file is encrypted and
 * does not yet have a valid key set.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) {
        return 1;
    }
    return bs->encrypted && !bs->valid_key;
}
3538

    
3539
/* Set the encryption key, recursing into the backing file first when it
 * is encrypted too.  Returns 0 on success, -EINVAL if the image is not
 * encrypted, -ENOMEDIUM if the driver cannot take a key, or the driver's
 * error.  On first success the deferred media-change callback is fired.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;

    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }
    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
3564

    
3565
/* Return the format name of the attached driver, or NULL if none. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}
3569

    
3570
/* Invoke 'it' once per registered block driver with its format name. */
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}
3579

    
3580
/* This function is to find block backend bs */
/* Linear search of the device list by device name; NULL if not found. */
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}
3592

    
3593
/* This function is to find a node in the bs graph */
/* Linear search of the named-node list; 'node_name' must be non-NULL.
 * Returns NULL if no node matches. */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}
3607

    
3608
/* Put this QMP function here so it can access the static graph_bdrv_states. */
/* Build a singly-linked list of BlockDeviceInfo, one entry per named
 * node.  Caller owns the returned list. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        /* Prepend: resulting list is in reverse traversal order. */
        entry->next = list;
        list = entry;
    }

    return list;
}
3624

    
3625
BlockDriverState *bdrv_lookup_bs(const char *device,
3626
                                 const char *node_name,
3627
                                 Error **errp)
3628
{
3629
    BlockDriverState *bs = NULL;
3630

    
3631
    if (device) {
3632
        bs = bdrv_find(device);
3633

    
3634
        if (bs) {
3635
            return bs;
3636
        }
3637
    }
3638

    
3639
    if (node_name) {
3640
        bs = bdrv_find_node(node_name);
3641

    
3642
        if (bs) {
3643
            return bs;
3644
        }
3645
    }
3646

    
3647
    error_setg(errp, "Cannot find device=%s nor node_name=%s",
3648
                     device ? device : "",
3649
                     node_name ? node_name : "");
3650
    return NULL;
3651
}
3652

    
3653
/* Iterate over the device list: NULL yields the first device, otherwise
 * the successor of 'bs' (NULL at the end).
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, device_list) : QTAILQ_FIRST(&bdrv_states);
}
3660

    
3661
/* Invoke 'it' once per device on the device list. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        it(opaque, bs);
    }
}
3669

    
3670
/* Return the device name of the BDS. */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}
3674

    
3675
/* Return the BDRV_O_* flags the device was opened with. */
int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}
3679

    
3680
int bdrv_flush_all(void)
3681
{
3682
    BlockDriverState *bs;
3683
    int result = 0;
3684

    
3685
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3686
        int ret = bdrv_flush(bs);
3687
        if (ret < 0 && !result) {
3688
            result = ret;
3689
        }
3690
    }
3691

    
3692
    return result;
3693
}
3694

    
3695
/* Helper for drivers whose images always read as zeroes when created. */
int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}
3699

    
3700
/* Return non-zero if a freshly created image of this kind is guaranteed to
 * read back as all zeroes.  Conservatively returns 0 when the driver does
 * not implement the query. */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}
3716

    
3717
/* Return true if unallocated blocks of this image read as zeroes.
 * A backing file makes this false (unallocated data comes from the backing
 * image); otherwise the answer is taken from the driver's BlockDriverInfo,
 * defaulting to false when bdrv_get_info() fails. */
bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}
3731

    
3732
/* Return true if writing zeroes may be implemented by unmapping/discarding.
 * Requires no backing file (unmapped data would expose backing contents),
 * the BDRV_O_UNMAP open flag, and driver confirmation via BlockDriverInfo. */
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}
3746

    
3747
/* Parameter/result bundle passed to the coroutine entry point of the
 * synchronous bdrv_get_block_status() wrapper. */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;       /* first sector to query */
    int nb_sectors;           /* maximum number of sectors to report on */
    int *pnum;                /* out: sectors in the same state */
    int64_t ret;              /* out: BDRV_BLOCK_* status or negative errno */
    bool done;                /* set when the coroutine has finished */
} BdrvCoGetBlockStatusData;
3756

    
3757
/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * On success the return value is a BDRV_BLOCK_* bitmask; on failure it is a
 * negative errno.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    /* Clamp the request to the end of the image. */
    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        /* Driver cannot report holes: everything counts as data.  For
         * protocol drivers the mapping is trivially the identity. */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        /* Raw pass-through: re-query the underlying file at the offset the
         * driver reported. */
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        /* Unallocated and not known-zero: it still reads as zero if
         * unallocated blocks are zero, or if it lies past the end of the
         * backing file. */
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}
3843

    
3844
/* Coroutine wrapper for bdrv_get_block_status(): unpacks the
 * BdrvCoGetBlockStatusData bundle, runs the query, and flags completion. */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}
3854

    
3855
/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        /* Spawn a coroutine and pump AIO events until it completes. */
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}
3884

    
3885
/* Return 1 if the given sectors are allocated in this image (i.e. reading
 * them does not fall through to the backing file), 0 if not, or a negative
 * errno on failure.  *pnum receives the run length as for
 * bdrv_get_block_status(). */
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    /* A zero cluster still counts as allocated unless the image zeroes
     * unwritten clusters by itself. */
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}
3896

    
3897
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 * A negative return value is an errno propagated from bdrv_is_allocated().
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk down from TOP towards BASE, stopping at the first image that
     * has the sector allocated. */
    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
3947

    
3948
/* Return the filename of the encrypted image in this (two-level) chain:
 * the backing file if it is encrypted, else this image if it is, else NULL.
 *
 * Fix: add braces to all conditional bodies as required by QEMU's
 * CODING_STYLE; behavior is unchanged. */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    } else if (bs->encrypted) {
        return bs->filename;
    } else {
        return NULL;
    }
}
3957

    
3958
/* Copy the backing filename (possibly empty) into the caller's buffer,
 * truncating to filename_size with NUL termination via pstrcpy(). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
3963

    
3964
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3965
                          const uint8_t *buf, int nb_sectors)
3966
{
3967
    BlockDriver *drv = bs->drv;
3968
    if (!drv)
3969
        return -ENOMEDIUM;
3970
    if (!drv->bdrv_write_compressed)
3971
        return -ENOTSUP;
3972
    if (bdrv_check_request(bs, sector_num, nb_sectors))
3973
        return -EIO;
3974

    
3975
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3976

    
3977
    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3978
}
3979

    
3980
/* Fill *bdi with driver-reported image information.  The structure is
 * zeroed first so fields the driver does not set are well defined.
 * Returns 0 on success, -ENOMEDIUM/-ENOTSUP otherwise.
 *
 * Fix: add braces to all conditional bodies as required by QEMU's
 * CODING_STYLE; behavior is unchanged. */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
3990

    
3991
/* Return driver-specific image information, or NULL when the driver is
 * missing or does not implement the query. */
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}
3999

    
4000
/* Save 'size' bytes of VM state at offset 'pos' by wrapping the flat buffer
 * in a single-element iovec and delegating to bdrv_writev_vmstate(). */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}
4012

    
4013
/* Write VM state from 'qiov' at offset 'pos'.  Prefers the driver's own
 * hook, otherwise recurses into the underlying file; -ENOMEDIUM without a
 * driver, -ENOTSUP when nothing in the chain supports vmstate. */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}
4027

    
4028
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4029
                      int64_t pos, int size)
4030
{
4031
    BlockDriver *drv = bs->drv;
4032
    if (!drv)
4033
        return -ENOMEDIUM;
4034
    if (drv->bdrv_load_vmstate)
4035
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
4036
    if (bs->file)
4037
        return bdrv_load_vmstate(bs->file, buf, pos, size);
4038
    return -ENOTSUP;
4039
}
4040

    
4041
/* Forward a blkdebug event to the driver, silently ignoring it when there
 * is no driver or no event hook. */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}
4049

    
4050
/* Set a blkdebug breakpoint.  Descends the bs->file chain to find the first
 * driver implementing the hook; -ENOTSUP if none does. */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}
4063

    
4064
/* Remove a blkdebug breakpoint by tag.  Descends the bs->file chain to the
 * first driver implementing the hook; -ENOTSUP if none does. */
int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}
4076

    
4077
/* Resume a request suspended at a blkdebug breakpoint.  Descends the
 * bs->file chain to the first driver implementing the hook; -ENOTSUP if
 * none does. */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}
4089

    
4090
/* Query whether a request with the given tag is suspended at a blkdebug
 * breakpoint.  Descends the bs->file chain; false when no driver in the
 * chain implements the hook. */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}
4102

    
4103
/* Return 1 if the image was opened with BDRV_O_SNAPSHOT, else 0. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}
4107

    
4108
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain.
 *
 * Returns the BDS in bs's backing chain whose backing file matches
 * 'backing_file', or NULL when there is no match. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    /* Scratch buffers for path canonicalization. */
    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
4173

    
4174
/* Count how many backing files sit below this image.  An image with no
 * driver or no backing file has depth 0; each level without a driver
 * terminates the count at that point. */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
4186

    
4187
/* Walk to the bottom of the backing chain and return the base image.
 * A NULL input yields NULL; an image without a backing file is its own
 * base. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *base;

    if (!bs) {
        return NULL;
    }

    for (base = bs; base->backing_hd; base = base->backing_hd) {
        /* keep descending */
    }

    return base;
}
4202

    
4203
/**************************************************************/
4204
/* async I/Os */
4205

    
4206
/* Submit an asynchronous read; 'cb' is invoked with 'opaque' and the request
 * status on completion.  Implemented on top of the coroutine r/w path. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}
4215

    
4216
/* Submit an asynchronous write; 'cb' is invoked with 'opaque' and the
 * request status on completion.  Implemented on top of the coroutine r/w
 * path. */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}
4225

    
4226
/* Submit an asynchronous write-zeroes request (no data buffer; the
 * BDRV_REQ_ZERO_WRITE flag tells the write path to synthesize zeroes). */
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4236

    
4237

    
4238
/* Completion bookkeeping for bdrv_aio_multiwrite(): tracks outstanding
 * merged requests and the per-caller callbacks to fire once all finish. */
typedef struct MultiwriteCB {
    int error;              /* first error seen among the requests */
    int num_requests;       /* merged requests still in flight */
    int num_callbacks;      /* original (pre-merge) request count */
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;    /* merged qiov to destroy, or NULL */
    } callbacks[];
} MultiwriteCB;
4248

    
4249
/* Fire every caller callback with the collective error status and free any
 * qiovs that were allocated while merging requests. */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}
4261

    
4262
/* Per-request completion for a multiwrite batch: record the first error and,
 * once the last request finishes, notify the callers and free the batch. */
static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}
4278

    
4279
static int multiwrite_req_compare(const void *a, const void *b)
4280
{
4281
    const BlockRequest *req1 = a, *req2 = b;
4282

    
4283
    /*
4284
     * Note that we can't simply subtract req2->sector from req1->sector
4285
     * here as that could overflow the return value.
4286
     */
4287
    if (req1->sector > req2->sector) {
4288
        return 1;
4289
    } else if (req1->sector < req2->sector) {
4290
        return -1;
4291
    } else {
4292
        return 0;
4293
    }
4294
}
4295

    
4296
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't merge if the combined iovec would exceed the IOV_MAX limit.
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            // because the sort above guarantees they are adjacent or overlap.
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged qiov so multiwrite_user_cb() can free it.
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
4355

    
4356
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    /* Capture each caller's callback before merging may coalesce entries. */
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4413

    
4414
/* Cancel an in-flight AIO request via its implementation's cancel hook. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}
4418

    
4419
/**************************************************************/
4420
/* async block device emulation */
4421

    
4422
/* AIOCB for the synchronous-emulation path: the driver's blocking
 * bdrv_read/bdrv_write runs immediately into a bounce buffer, and a bottom
 * half delivers the completion. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;             /* bottom half delivering the completion */
    int ret;                /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;        /* linear bounce buffer for the whole qiov */
    int is_write;
} BlockDriverAIOCBSync;
4431

    
4432
/* Cancel hook for the sync-emulated AIOCB: the I/O already ran
 * synchronously, so just drop the pending completion BH and the ACB. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
4440

    
4441
/* AIOCB descriptor for the synchronous-emulation path. */
static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
4445

    
4446
static void bdrv_aio_bh_cb(void *opaque)
4447
{
4448
    BlockDriverAIOCBSync *acb = opaque;
4449

    
4450
    if (!acb->is_write)
4451
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4452
    qemu_vfree(acb->bounce);
4453
    acb->common.cb(acb->common.opaque, acb->ret);
4454
    qemu_bh_delete(acb->bh);
4455
    acb->bh = NULL;
4456
    qemu_aio_release(acb);
4457
}
4458

    
4459
/* Emulate AIO on top of a driver's synchronous bdrv_read/bdrv_write: the
 * transfer happens immediately through a bounce buffer, and completion is
 * reported later from a bottom half so the callback semantics match real
 * AIO. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        /* Linearize the iovec before handing it to the sync driver call. */
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    /* Deliver the completion from a BH, as real AIO would. */
    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
4487

    
4488
/* AIO read emulated via the synchronous bounce-buffer path. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
4494

    
4495
/* AIO write emulated via the synchronous bounce-buffer path. */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
4501

    
4502

    
4503
/* AIOCB for requests executed inside a coroutine; completion is delivered
 * via a bottom half once the coroutine finishes. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;       /* request parameters and result (req.error) */
    bool is_write;
    bool *done;             /* set by cancel to synchronously wait, or NULL */
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;
4510

    
4511
/* Cancel hook for coroutine-based AIOCBs: the request cannot be aborted,
 * so pump AIO events until its completion BH sets 'done'. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}
4522

    
4523
/* AIOCB descriptor for the coroutine-based emulation path. */
static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
4527

    
4528
/* Bottom-half completion for coroutine AIOCBs: invoke the user callback,
 * wake a synchronously-waiting canceller if any, and release the ACB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
4541

    
4542
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    /* Defer the completion callback to a bottom half. */
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4559

    
4560
/* Common backend for bdrv_aio_readv/writev/write_zeroes: package the request
 * into a coroutine AIOCB and start a coroutine running bdrv_co_do_rw(). */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4585

    
4586
/* Coroutine entry for bdrv_aio_flush(): flush, then complete via BH. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4595

    
4596
/* Submit an asynchronous flush; 'cb' is invoked with 'opaque' and the flush
 * status on completion. */
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4612

    
4613
/* Coroutine entry point for bdrv_aio_discard(): perform the discard with
 * the parameters stashed in the acb, then hand completion off to a BH. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->req.error = bdrv_co_discard(acb->common.bs, acb->req.sector,
                                     acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4622

    
4623
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4624
        int64_t sector_num, int nb_sectors,
4625
        BlockDriverCompletionFunc *cb, void *opaque)
4626
{
4627
    Coroutine *co;
4628
    BlockDriverAIOCBCoroutine *acb;
4629

    
4630
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4631

    
4632
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4633
    acb->req.sector = sector_num;
4634
    acb->req.nb_sectors = nb_sectors;
4635
    acb->done = NULL;
4636
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4637
    qemu_coroutine_enter(co, acb);
4638

    
4639
    return &acb->common;
4640
}
4641

    
4642
/* Register all built-in block drivers (runs their module init hooks). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
4646

    
4647
void bdrv_init_with_whitelist(void)
4648
{
4649
    use_bdrv_whitelist = 1;
4650
    bdrv_init();
4651
}
4652

    
4653
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4654
                   BlockDriverCompletionFunc *cb, void *opaque)
4655
{
4656
    BlockDriverAIOCB *acb;
4657

    
4658
    acb = g_slice_alloc(aiocb_info->aiocb_size);
4659
    acb->aiocb_info = aiocb_info;
4660
    acb->bs = bs;
4661
    acb->cb = cb;
4662
    acb->opaque = opaque;
4663
    return acb;
4664
}
4665

    
4666
/* Return an AIOCB obtained from qemu_aio_get() to the slice allocator.
 * The size recorded in acb->aiocb_info must match the allocation. */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
4671

    
4672
/**************************************************************/
4673
/* Coroutine block device emulation */
4674

    
4675
/* Bridges a callback-style AIO completion back into a waiting coroutine:
 * the AIO callback stores the result in @ret and re-enters @coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;
4679

    
4680
static void bdrv_co_io_em_complete(void *opaque, int ret)
4681
{
4682
    CoroutineIOCompletion *co = opaque;
4683

    
4684
    co->ret = ret;
4685
    qemu_coroutine_enter(co->coroutine, NULL);
4686
}
4687

    
4688
/* Emulate a coroutine read/write on top of a driver that only provides the
 * callback-style bdrv_aio_readv/bdrv_aio_writev interface.  Submits the AIO
 * request and yields until bdrv_co_io_em_complete() re-enters us.
 * Returns the driver's result, or -EIO if the request could not be issued. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    BlockDriverAIOCB *acb;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };

    acb = is_write
        ? bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                   bdrv_co_io_em_complete, &co)
        : bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                  bdrv_co_io_em_complete, &co);

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    /* Sleep until the completion callback re-enters this coroutine. */
    qemu_coroutine_yield();

    return co.ret;
}
4713

    
4714
/* Coroutine read emulated via the driver's AIO interface. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4720

    
4721
/* Coroutine write emulated via the driver's AIO interface. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
4727

    
4728
/* Coroutine trampoline for bdrv_flush(): store the flush result back into
 * the caller-provided RwCo so the synchronous wrapper can observe it. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
4734

    
4735
/* Flush @bs and everything below it, in order:
 *   1. driver's flush-to-OS hook (always, even with cache=unsafe),
 *   2. driver's flush-to-disk hook (skipped for BDRV_O_NO_FLUSH),
 *   3. recursively flush bs->file (the underlying protocol layer).
 * Returns 0 on success or a negative errno.  A missing/read-only/empty
 * medium is treated as "nothing to flush" and succeeds. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to do without a writable, inserted medium. */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        /* Native coroutine flush. */
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Callback-style AIO flush: submit and yield until completion. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
4797

    
4798
/* Drop any metadata the driver may have cached for @bs, if the driver
 * implements the hook; a no-op otherwise. */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv == NULL || drv->bdrv_invalidate_cache == NULL) {
        return;
    }
    drv->bdrv_invalidate_cache(bs);
}
4804

    
4805
void bdrv_invalidate_cache_all(void)
4806
{
4807
    BlockDriverState *bs;
4808

    
4809
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4810
        bdrv_invalidate_cache(bs);
4811
    }
4812
}
4813

    
4814
void bdrv_clear_incoming_migration_all(void)
4815
{
4816
    BlockDriverState *bs;
4817

    
4818
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4819
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4820
    }
4821
}
4822

    
4823
/* Synchronous flush wrapper around bdrv_co_flush().  When already running
 * in coroutine context the flush is called directly; otherwise a coroutine
 * is spawned and we spin on qemu_aio_wait() until it reports completion. */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo rwco = {
        .bs  = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
        return rwco.ret;
    }

    Coroutine *co = qemu_coroutine_create(bdrv_flush_co_entry);
    qemu_coroutine_enter(co, &rwco);
    /* Pump the event loop until the coroutine stores its result. */
    while (rwco.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return rwco.ret;
}
4844

    
4845
/* Context for a synchronous discard: carries the request parameters into
 * bdrv_discard_co_entry() and the result (@ret, NOT_DONE while pending)
 * back to bdrv_discard(). */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;
4851
/* Coroutine trampoline for bdrv_discard(): run the discard and publish the
 * result in the shared DiscardCo. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *dco = opaque;

    dco->ret = bdrv_co_discard(dco->bs, dco->sector_num, dco->nb_sectors);
}
4857

    
4858
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768
4862

    
4863
/* Discard (unmap) @nb_sectors starting at @sector_num on @bs.
 *
 * The request is split into chunks honouring the driver's discard alignment
 * (bs->bl.discard_alignment) and maximum request size (bs->bl.max_discard,
 * defaulting to MAX_DISCARD_DEFAULT sectors).  Each chunk goes through the
 * driver's coroutine hook if available, otherwise through its AIO hook.
 *
 * Returns 0 on success (including when discard is disabled via BDRV_O_UNMAP
 * being clear, or unsupported by the driver), -ENOMEDIUM without a medium,
 * -EIO for an out-of-range request, -EROFS on a read-only device, or the
 * driver's error.  Driver -ENOTSUP results are ignored per-chunk. */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    /* The range no longer needs to be tracked as dirty once discarded. */
    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            /* Shrink the chunk so the next iteration starts aligned. */
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Fix: discard only the aligned/clamped chunk 'num'; the old
             * code passed the full remaining 'nb_sectors', defeating the
             * alignment and max_discard clamping computed above. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
4933

    
4934
/* Synchronous discard wrapper around bdrv_co_discard(): direct call when
 * already in coroutine context, otherwise spawn a coroutine and pump the
 * event loop until it finishes. */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    DiscardCo dco = {
        .bs         = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret        = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&dco);
        return dco.ret;
    }

    Coroutine *co = qemu_coroutine_create(bdrv_discard_co_entry);
    qemu_coroutine_enter(co, &dco);
    while (dco.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return dco.ret;
}
4957

    
4958
/**************************************************************/
4959
/* removable device support */
4960

    
4961
/**
4962
 * Return TRUE if the media is present
4963
 */
4964
int bdrv_is_inserted(BlockDriverState *bs)
4965
{
4966
    BlockDriver *drv = bs->drv;
4967

    
4968
    if (!drv)
4969
        return 0;
4970
    if (!drv->bdrv_is_inserted)
4971
        return 1;
4972
    return drv->bdrv_is_inserted(bs);
4973
}
4974

    
4975
/**
4976
 * Return whether the media changed since the last call to this
4977
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4978
 */
4979
int bdrv_media_changed(BlockDriverState *bs)
4980
{
4981
    BlockDriver *drv = bs->drv;
4982

    
4983
    if (drv && drv->bdrv_media_changed) {
4984
        return drv->bdrv_media_changed(bs);
4985
    }
4986
    return -ENOTSUP;
4987
}
4988

    
4989
/**
4990
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4991
 */
4992
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4993
{
4994
    BlockDriver *drv = bs->drv;
4995

    
4996
    if (drv && drv->bdrv_eject) {
4997
        drv->bdrv_eject(bs, eject_flag);
4998
    }
4999

    
5000
    if (bs->device_name[0] != '\0') {
5001
        bdrv_emit_qmp_eject_event(bs, eject_flag);
5002
    }
5003
}
5004

    
5005
/**
5006
 * Lock or unlock the media (if it is locked, the user won't be able
5007
 * to eject it manually).
5008
 */
5009
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5010
{
5011
    BlockDriver *drv = bs->drv;
5012

    
5013
    trace_bdrv_lock_medium(bs, locked);
5014

    
5015
    if (drv && drv->bdrv_lock_medium) {
5016
        drv->bdrv_lock_medium(bs, locked);
5017
    }
5018
}
5019

    
5020
/* needed for generic scsi interface */
5021

    
5022
/* Forward a SCSI ioctl to the driver; -ENOTSUP if it has no ioctl hook. */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv == NULL || drv->bdrv_ioctl == NULL) {
        return -ENOTSUP;
    }
    return drv->bdrv_ioctl(bs, req, buf);
}
5030

    
5031
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5032
        unsigned long int req, void *buf,
5033
        BlockDriverCompletionFunc *cb, void *opaque)
5034
{
5035
    BlockDriver *drv = bs->drv;
5036

    
5037
    if (drv && drv->bdrv_aio_ioctl)
5038
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5039
    return NULL;
5040
}
5041

    
5042
/* Record the guest's block size for @bs (stored in guest_block_size;
 * consumers of this field are outside this chunk). */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
5046

    
5047
/* Allocate @size bytes aligned to the optimal memory alignment for I/O
 * on @bs (bdrv_opt_mem_align). */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
5051

    
5052
/*
5053
 * Check if all memory in this vector is sector aligned.
5054
 */
5055
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5056
{
5057
    int i;
5058
    size_t alignment = bdrv_opt_mem_align(bs);
5059

    
5060
    for (i = 0; i < qiov->niov; i++) {
5061
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5062
            return false;
5063
        }
5064
        if (qiov->iov[i].iov_len % alignment) {
5065
            return false;
5066
        }
5067
    }
5068

    
5069
    return true;
5070
}
5071

    
5072
/* Create a dirty bitmap for @bs with the given @granularity (in bytes;
 * must be a power of two and at least one sector) and link it into
 * bs->dirty_bitmaps.  Ownership stays with @bs until
 * bdrv_release_dirty_bitmap() is called. */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    /* Convert byte granularity to sectors. */
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    /* NOTE(review): bdrv_getlength() can return a negative errno; that is
     * not checked here and would be passed on to hbitmap_alloc — confirm. */
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    /* ffs(granularity) - 1 == log2(granularity), valid since granularity
     * is a nonzero power of two. */
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5087

    
5088
/* Unlink @bitmap from bs->dirty_bitmaps and free it.  A bitmap that is
 * not on the list is silently ignored. */
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *cur, *next;

    QLIST_FOREACH_SAFE(cur, &bs->dirty_bitmaps, list, next) {
        if (cur != bitmap) {
            continue;
        }
        QLIST_REMOVE(bitmap, list);
        hbitmap_free(bitmap->bitmap);
        g_free(bitmap);
        return;
    }
}
5100

    
5101
/* Build a QAPI BlockDirtyInfoList describing every dirty bitmap on @bs,
 * preserving list order.  The caller owns the returned list. */
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BlockDirtyInfoList *head = NULL;
    BlockDirtyInfoList **tail = &head;
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));

        info->count = bdrv_get_dirty_count(bs, bm);
        /* Granularity is reported in bytes, derived from the hbitmap's
         * sector granularity. */
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;

        /* Append via tail pointer to keep O(1) insertion in order. */
        *tail = entry;
        tail = &entry->next;
    }

    return head;
}
5120

    
5121
/* Return nonzero iff @sector is marked dirty in @bitmap; a NULL bitmap
 * reads as clean. */
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap == NULL) {
        return 0;
    }
    return hbitmap_get(bitmap->bitmap, sector);
}
5129

    
5130
/* Initialize @hbi to iterate @bitmap from the beginning (position 0).
 * @bs is unused here; it is kept for API symmetry with the other dirty
 * bitmap helpers. */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5135

    
5136
/* Mark [cur_sector, cur_sector + nr_sectors) dirty in every bitmap
 * attached to @bs. */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        hbitmap_set(bm->bitmap, cur_sector, nr_sectors);
    }
}
5144

    
5145
/* Clear [cur_sector, cur_sector + nr_sectors) in every bitmap attached
 * to @bs. */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bm;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bm->bitmap, cur_sector, nr_sectors);
    }
}
5152

    
5153
/* Return the number of dirty sectors tracked by @bitmap (@bs unused). */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
5157

    
5158
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    /* Not thread-safe; callers run under the big QEMU lock. -- NOTE(review):
     * inferred from the plain increment, confirm against locking rules. */
    bs->refcnt++;
}
5163

    
5164
/* Release a previously grabbed reference to bs.
5165
 * If after releasing, reference count is zero, the BlockDriverState is
5166
 * deleted. */
5167
void bdrv_unref(BlockDriverState *bs)
5168
{
5169
    assert(bs->refcnt > 0);
5170
    if (--bs->refcnt == 0) {
5171
        bdrv_delete(bs);
5172
    }
5173
}
5174

    
5175
/* Toggle the in-use marker on @bs.  The assert enforces that callers only
 * ever flip the state, never set it to its current value. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
5180

    
5181
/* Return the current in-use marker for @bs. */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
5185

    
5186
/* Enable I/O status tracking for @bs and reset it to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5191

    
5192
/* The I/O status is only enabled if the drive explicitly
5193
 * enables it _and_ the VM is configured to stop on errors */
5194
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5195
{
5196
    return (bs->iostatus_enabled &&
5197
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5198
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5199
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5200
}
5201

    
5202
/* Disable I/O status tracking for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5206

    
5207
/* Reset the I/O status of @bs (and of its running block job, if any) back
 * to OK.  No-op when I/O status tracking is not enabled. */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    if (bs->job != NULL) {
        block_job_iostatus_reset(bs->job);
    }
}
5216

    
5217
/* Latch an error into the I/O status of @bs: ENOSPC maps to NOSPACE, any
 * other errno to FAILED.  Once non-OK, the status is sticky until reset. */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));

    /* Only the first error is recorded; later ones are ignored. */
    if (bs->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
        return;
    }

    if (error == ENOSPC) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
    } else {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
5225

    
5226
void
5227
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5228
        enum BlockAcctType type)
5229
{
5230
    assert(type < BDRV_MAX_IOTYPE);
5231

    
5232
    cookie->bytes = bytes;
5233
    cookie->start_time_ns = get_clock();
5234
    cookie->type = type;
5235
}
5236

    
5237
void
5238
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5239
{
5240
    assert(cookie->type < BDRV_MAX_IOTYPE);
5241

    
5242
    bs->nr_bytes[cookie->type] += cookie->bytes;
5243
    bs->nr_ops[cookie->type]++;
5244
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5245
}
5246

    
5247
/* Create a new disk image.
 *
 * @filename:      image to create; its protocol picks the protocol driver
 * @fmt:           image format name (must resolve via bdrv_find_format)
 * @base_filename: optional backing file (overridden by -o backing_file)
 * @base_fmt:      optional backing file format
 * @options:       "-o" style option string, parsed against the combined
 *                 format + protocol create_options
 * @img_size:      requested size in bytes; -1 means "derive from backing
 *                 file" and is an error if there is no backing file
 * @flags:         BDRV_O_* open flags used when probing the backing file
 * @errp:          set on any failure
 * @quiet:         suppress the "Formatting ..." progress line
 *
 * On failure nothing is created and *errp describes the problem. */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    /* The creatable option set is the union of the format driver's and the
     * protocol driver's options. */
    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    /* Explicit base_filename/base_fmt arguments are injected as if they had
     * been given via -o; failure means the format has no such option. */
    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        /* An image backed by itself would loop forever on read. */
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            /* NOTE(review): this uint64_t 'size' shadows the
             * QEMUOptionParameter *size above — legal but confusing. */
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            /* Geometry is in 512-byte sectors; convert to bytes. */
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    /* Common exit path: free option lists and propagate any pending error. */
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (local_err) {
        error_propagate(errp, local_err);
    }
}
5393

    
5394
/* Return the AioContext that I/O for @bs runs in. */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}
5399

    
5400
/* Register @notifier to be invoked before every write request on @bs. */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
5405

    
5406
/* Amend the creation-time options of an existing image via the driver's
 * bdrv_amend_options hook.  Returns the driver's result, or -ENOTSUP when
 * there is no medium or the driver does not support amending.
 *
 * Fix: guard against bs->drv being NULL (no medium), consistent with the
 * other entry points in this file that check bs->drv before dereferencing. */
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv == NULL || bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}
5413

    
5414
/* Used to recurse on single child block filters.
5415
 * Single child block filter will store their child in bs->file.
5416
 */
5417
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5418
                                      BlockDriverState *candidate)
5419
{
5420
    if (!bs->drv) {
5421
        return false;
5422
    }
5423

    
5424
    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5425
        if (bs == candidate) {
5426
            return true;
5427
        } else {
5428
            return false;
5429
        }
5430
    }
5431

    
5432
    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5433
        return false;
5434
    }
5435

    
5436
    if (!bs->file) {
5437
        return false;
5438
    }
5439

    
5440
    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5441
}
5442

    
5443
/* Recursion step for the first-non-filter walk: drivers may override it
 * with their own hook; otherwise the generic single-child logic applies. */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    BlockDriver *drv = bs->drv;

    if (drv != NULL && drv->bdrv_recurse_is_first_non_filter != NULL) {
        return drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    return bdrv_generic_is_first_non_filter(bs, candidate);
}
5452

    
5453
/* This function checks if the candidate is the first non filter bs down it's
5454
 * bs chain. Since we don't have pointers to parents it explore all bs chains
5455
 * from the top. Some filters can choose not to pass down the recursion.
5456
 */
5457
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5458
{
5459
    BlockDriverState *bs;
5460

    
5461
    /* walk down the bs forest recursively */
5462
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5463
        bool perm;
5464

    
5465
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5466

    
5467
        /* candidate is the first non filter */
5468
        if (perm) {
5469
            return true;
5470
        }
5471
    }
5472

    
5473
    return false;
5474
}