/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
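
/*
 * Illustrative caller sketch (not part of the original file; the concrete
 * limit value and the ThrottleConfig field names are assumptions from the
 * throttle API of this era): per the ordering rule noted above, throttling
 * is enabled before a ThrottleConfig is installed.
 *
 *     ThrottleConfig cfg = { 0 };
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;  // ~10 MiB/s
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */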

/* This function makes an I/O request wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* must this I/O wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue the next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
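
/*
 * Illustrative sketch (not part of the original file): I/O buffer
 * allocations can honor the alignment reported above, e.g.:
 *
 *     void *buf = qemu_memalign(bdrv_opt_mem_align(bs), len);
 *
 * (qemu_blockalign() in this file's API family does essentially this.)
 */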

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
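
/*
 * Example (illustrative, the paths are made up): combining a relative
 * backing file name with the location of its overlay:
 *
 *     char buf[PATH_MAX];
 *     path_combine(buf, sizeof(buf), "/images/overlay.qcow2", "base.qcow2");
 *     // buf now contains "/images/base.qcow2"
 */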

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
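
/*
 * Illustrative use (error handling abbreviated): on POSIX hosts the file is
 * created under $TMPDIR, or /var/tmp as a fallback.
 *
 *     char tmp[PATH_MAX];
 *     int ret = get_tmp_filename(tmp, sizeof(tmp));
 *     if (ret < 0) {
 *         // report strerror(-ret)
 *     }
 */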

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
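
/*
 * Example (illustrative): with allow_protocol_prefix=true, a filename such
 * as "nbd:localhost:10809" resolves to the driver whose protocol_name is
 * "nbd" (assuming that driver is built in), while a plain path like
 * "/images/disk.raw" carries no "<protocol>:" prefix and falls back to the
 * "file" driver:
 *
 *     BlockDriver *drv = bdrv_find_protocol("nbd:localhost:10809", true);
 */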

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
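
/*
 * Illustrative mapping (follows the branches above):
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     bdrv_parse_cache_flags("writethrough", &flags);
 *     // flags == 0 again: the default mode sets no extra bits
 */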

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
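
/*
 * Illustrative sketch of the reference-count contract described above:
 * every enable must eventually be paired with exactly one disable.
 *
 *     bdrv_enable_copy_on_read(bs);   // count 0 -> 1, COR active
 *     bdrv_enable_copy_on_read(bs);   // count 1 -> 2, still active
 *     bdrv_disable_copy_on_read(bs);  // count 2 -> 1, still active
 *     bdrv_disable_copy_on_read(bs);  // count 1 -> 0, COR off
 */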

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* an empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy the node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol as drv. This layer is
     * already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or a pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(*options, "filename");
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files are always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}
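
/*
 * Illustrative caller pattern for the QDict ownership rule documented
 * above: the reference is consumed even on failure, so take an extra
 * reference first if the dictionary will be reused.
 *
 *     QINCREF(backing_options);
 *     ret = bdrv_open_backing_file(bs, backing_options, &local_err);
 *     // backing_options remains valid here thanks to the QINCREF
 */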

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
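
/*
 * Illustrative sketch of the flattened-BlockdevRef convention described
 * above (the option keys are made-up examples): with bdref_key "file",
 * entries such as "file.driver" and "file.filename" in the caller's
 * options QDict are extracted into the image's own QDict before opening:
 *
 *     BlockDriverState *file = NULL;
 *     ret = bdrv_open_image(&file, NULL, options, "file",
 *                           flags | BDRV_O_PROTOCOL, false, &local_err);
 */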

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        QINCREF(options);
        bs1 = NULL;
        ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP) |
                          BDRV_O_PROTOCOL, true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
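
/*
 * Illustrative call (a sketch, assuming format probing and default options;
 * the path is made up):
 *
 *     BlockDriverState *bs = NULL;
 *     Error *local_err = NULL;
 *     int ret = bdrv_open(&bs, "/images/disk.qcow2", NULL, NULL,
 *                         BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, &local_err);
 *     ...
 *     bdrv_unref(bs);
 */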

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
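
/*
 * Illustrative multi-device variant of the helper above (sketch only):
 * queue several devices, then reopen them all-or-nothing.
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, flags_a);
 *     queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 */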

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}
1782

    
1783
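/*
 * Illustrative sketch (not part of the original file): as the comment above
 * notes, draining does not make data stable on disk, so shutdown paths pair
 * the two calls:
 *
 *     bdrv_drain_all();   // wait for in-flight and throttled requests
 *     bdrv_flush_all();   // then flush everything to stable storage
 */
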
/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
 * Also, NUL-terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list   = bs_src->node_list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

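/*
 * Illustrative sketch (not part of the original file): after
 * bdrv_append(bs_new, bs_top), the image that was opened as bs_new sits on
 * top of the chain while the device keeps its existing BlockDriverState
 * pointer.  Because of the swap, a chain
 *
 *     base <- bs_top                 (bs_top attached to the device)
 *
 * becomes
 *
 *     base <- bs_new <- bs_top       (bs_top still attached to the device)
 *
 * where the bs_new object now holds the old top contents and the bs_top
 * object holds the newly added image.
 */
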
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

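/*
 * Illustrative sketch (not part of the original file): a report-only check,
 * assuming the BdrvCheckResult counters (corruptions, leaks) used by
 * qemu-img check:
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, 0);   // fix = 0: report, repair nothing
 *     if (ret == 0 && (res.corruptions || res.leaks)) {
 *         ...the check ran fine but found problems in the image...
 *     }
 */
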
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

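/*
 * Illustrative sketch (not part of the original file): bdrv_commit() copies
 * every allocated range of the overlay down into its backing file, e.g. for
 * a COW overlay on a raw image:
 *
 *     ret = bdrv_commit(bs);   // bs: overlay with bs->backing_hd set
 *     if (ret < 0) {
 *         ...commit failed; sectors already copied remain in the backing...
 *     }
 *
 * Only ranges where bdrv_is_allocated() reports data in the overlay are
 * written, so unallocated ranges keep whatever the backing file already has.
 */
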
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

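/*
 * Illustrative example (not part of the original file): with align = 65536
 * (one 64 KiB cluster), a request at offset 65000 of 2000 bytes gets
 *
 *     overlap_offset = 65000 & ~65535              = 0
 *     overlap_bytes  = ROUND_UP(67000, 65536) - 0  = 131072
 *
 * i.e. the request serialises against anything touching its two clusters.
 */
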
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

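/*
 * Illustrative example (not part of the original file): with a 64 KiB
 * cluster size, c = 65536 / 512 = 128 sectors, so the region
 * sector_num = 100, nb_sectors = 50 is widened to
 *
 *     *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)    = 0
 *     *cluster_nb_sectors = QEMU_ALIGN_UP(100 + 50, 128) = 256
 *
 * covering both clusters that the original request touches.
 */
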
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

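/*
 * Illustrative example (not part of the original file): both ranges are
 * treated as half-open byte intervals, so [0, 4096) and [4096, 8192) do
 * NOT overlap, while [0, 4097) and [4096, 8192) do.
 */
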
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

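/*
 * Illustrative example (not part of the original file): in the chain
 *
 *     base <- sn1 <- active
 *
 * bdrv_find_overlay(active, base) returns sn1 (the layer whose backing_hd
 * is base), bdrv_find_overlay(active, sn1) returns active, and
 * bdrv_find_overlay(active, active) returns NULL.
 */
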
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    bdrv_refresh_limits(new_top_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

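/*
 * Illustrative sketch (not part of the original file): zeroing a device
 * while letting the driver unmap instead of writing explicit zeroes:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *
 * Ranges that bdrv_get_block_status() already reports as BDRV_BLOCK_ZERO
 * are skipped, so the call is cheap on freshly created sparse images.
 */
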
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

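/*
 * Illustrative sketch (not part of the original file): format drivers use
 * this barrier write for metadata that must not be reordered with later
 * writes, e.g. updating an image header at a known offset:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         ...header update failed; no later write has overtaken it...
 *     }
 */
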
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}

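/*
 * Illustrative example (not part of the original file): with
 * write_zeroes_alignment = 8 sectors, a request for sectors [5, 25) is
 * split into three driver calls by the loop above:
 *
 *     sectors  5..7   (num = 3, head up to the first aligned sector)
 *     sectors  8..23  (num = 16, aligned bulk)
 *     sectors 24..24  (num = 1, unaligned tail)
 */
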
/*
3132
 * Forwards an already correctly aligned write request to the BlockDriver.
3133
 */
3134
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3135
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3136
    QEMUIOVector *qiov, int flags)
3137
{
3138
    BlockDriver *drv = bs->drv;
3139
    bool waited;
3140
    int ret;
3141

    
3142
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3143
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3144

    
3145
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3146
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3147

    
3148
    waited = wait_serialising_requests(req);
3149
    assert(!waited || !req->serialising);
3150
    assert(req->overlap_offset <= offset);
3151
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3152

    
3153
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3154

    
3155
    if (ret < 0) {
3156
        /* Do nothing, write notifier decided to fail this request */
3157
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
3158
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3159
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3160
    } else {
3161
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3162
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3163
    }
3164
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3165

    
3166
    if (ret == 0 && !bs->enable_write_cache) {
3167
        ret = bdrv_co_flush(bs);
3168
    }
3169

    
3170
    bdrv_set_dirty(bs, sector_num, nb_sectors);
3171

    
3172
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3173
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
3174
    }
3175
    if (bs->growable && ret >= 0) {
3176
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3177
    }
3178

    
3179
    return ret;
3180
}
3181

    
3182
/*
3183
 * Handle a write request in coroutine context
3184
 */
3185
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3186
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3187
    BdrvRequestFlags flags)
3188
{
3189
    BdrvTrackedRequest req;
3190
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3191
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3192
    uint8_t *head_buf = NULL;
3193
    uint8_t *tail_buf = NULL;
3194
    QEMUIOVector local_qiov;
3195
    bool use_local_qiov = false;
3196
    int ret;
3197

    
3198
    if (!bs->drv) {
3199
        return -ENOMEDIUM;
3200
    }
3201
    if (bs->read_only) {
3202
        return -EACCES;
3203
    }
3204
    if (bdrv_check_byte_request(bs, offset, bytes)) {
3205
        return -EIO;
3206
    }
3207

    
3208
    /* throttling disk I/O */
3209
    if (bs->io_limits_enabled) {
3210
        bdrv_io_limits_intercept(bs, bytes, true);
3211
    }
3212

    
3213
    /*
3214
     * Align write if necessary by performing a read-modify-write cycle.
3215
     * Pad qiov with the read parts and be sure to have a tracked request not
3216
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3217
     */
3218
    tracked_request_begin(&req, bs, offset, bytes, true);
3219

    
3220
    if (offset & (align - 1)) {
3221
        QEMUIOVector head_qiov;
3222
        struct iovec head_iov;
3223

    
3224
        mark_request_serialising(&req, align);
3225
        wait_serialising_requests(&req);
3226

    
3227
        head_buf = qemu_blockalign(bs, align);
3228
        head_iov = (struct iovec) {
3229
            .iov_base   = head_buf,
3230
            .iov_len    = align,
3231
        };
3232
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3233

    
3234
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3235
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3236
                                  align, &head_qiov, 0);
3237
        if (ret < 0) {
3238
            goto fail;
3239
        }
3240
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3241

    
3242
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
3243
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3244
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3245
        use_local_qiov = true;
3246

    
3247
        bytes += offset & (align - 1);
3248
        offset = offset & ~(align - 1);
3249
    }
3250

    
3251
    if ((offset + bytes) & (align - 1)) {
3252
        QEMUIOVector tail_qiov;
3253
        struct iovec tail_iov;
3254
        size_t tail_bytes;
3255
        bool waited;
3256

    
3257
        mark_request_serialising(&req, align);
3258
        waited = wait_serialising_requests(&req);
3259
        assert(!waited || !use_local_qiov);
3260

    
3261
        tail_buf = qemu_blockalign(bs, align);
3262
        tail_iov = (struct iovec) {
3263
            .iov_base   = tail_buf,
3264
            .iov_len    = align,
3265
        };
3266
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3267

    
3268
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3269
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3270
                                  align, &tail_qiov, 0);
3271
        if (ret < 0) {
3272
            goto fail;
3273
        }
3274
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3275

    
3276
        if (!use_local_qiov) {
3277
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
3278
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3279
            use_local_qiov = true;
3280
        }
3281

    
3282
        tail_bytes = (offset + bytes) & (align - 1);
3283
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3284

    
3285
        bytes = ROUND_UP(bytes, align);
3286
    }
3287

    
3288
    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3289
                               use_local_qiov ? &local_qiov : qiov,
3290
                               flags);
3291

    
3292
fail:
3293
    tracked_request_end(&req);
3294

    
3295
    if (use_local_qiov) {
3296
        qemu_iovec_destroy(&local_qiov);
3297
    }
3298
    qemu_vfree(head_buf);
3299
    qemu_vfree(tail_buf);
3300

    
3301
    return ret;
3302
}

static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}
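
/*
 * Minimal usage sketch (hypothetical caller, coroutine context assumed):
 * zero out the first megabyte and allow the driver to unmap it if the
 * image was opened with BDRV_O_UNMAP.
 *
 *     int ret = bdrv_co_write_zeroes(bs, 0, 2048, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         ... handle error ...
 *     }
 */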

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}
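
/*
 * Sketch of a caller distinguishing the failure modes above (hypothetical;
 * error handling reduced to the interesting cases):
 *
 *     switch (bdrv_truncate(bs, new_size)) {
 *     case -ENOTSUP: ... driver cannot resize ...
 *     case -EACCES:  ... image is read-only ...
 *     case -EBUSY:   ... a block job owns the device ...
 *     case 0:        ... total_sectors already refreshed ...
 *     }
 */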

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}
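
/*
 * The two size queries differ: bdrv_getlength() reports the virtual disk
 * size, bdrv_get_allocated_file_size() the host space actually consumed.
 * For a freshly created, fully sparse 10 GiB raw image one would expect
 * (illustrative values):
 *
 *     int64_t virt = bdrv_getlength(bs);                 // 10737418240
 *     int64_t host = bdrv_get_allocated_file_size(bs);   // close to 0
 */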

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BDRV_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BDRV_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BDRV_ACTION_IGNORE;
    default:
        abort();
    }
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}
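
/*
 * Typical device-model pattern (a sketch; the exact completion path varies
 * by device, and example_write_complete is a hypothetical name): map the
 * errno to an action, then report it.
 *
 *     static void example_write_complete(BlockDriverState *bs, int ret)
 *     {
 *         if (ret < 0) {
 *             BlockErrorAction action = bdrv_get_error_action(bs, false, -ret);
 *             bdrv_error_action(bs, action, false, -ret);
 *             if (action != BDRV_ACTION_IGNORE) {
 *                 return;
 *             }
 *         }
 *         ... complete the guest request ...
 *     }
 */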

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
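
/*
 * Monitor-style usage sketch (hypothetical): only prompt for a key when one
 * is actually required, then install it on the whole chain.
 *
 *     if (bdrv_key_required(bs)) {
 *         const char *password = ...;   // obtained from the user
 *         if (bdrv_set_key(bs, password) < 0) {
 *             ... wrong key, or no encryption support in the driver ...
 *         }
 *     }
 */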

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

/* Find a block backend by its device name */
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Find a node in the graph of BlockDriverStates by its node name */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Put this QMP function here so it can access the static graph_bdrv_states. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        entry->next = list;
        list = entry;
    }

    return list;
}

BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp)
{
    BlockDriverState *bs = NULL;

    if (device) {
        bs = bdrv_find(device);

        if (bs) {
            return bs;
        }
    }

    if (node_name) {
        bs = bdrv_find_node(node_name);

        if (bs) {
            return bs;
        }
    }

    error_setg(errp, "Cannot find device=%s nor node_name=%s",
                     device ? device : "",
                     node_name ? node_name : "");
    return NULL;
}
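
/*
 * QMP-style usage sketch (hypothetical values): resolve a user-supplied
 * reference, trying the device name first and the node name second,
 * exactly as the lookup order above implies.
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs("virtio0", NULL, &local_err);
 *     if (!bs) {
 *         ... propagate local_err to the caller ...
 *     }
 */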

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, device_list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        it(opaque, bs);
    }
}
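
/*
 * Iteration sketch: bdrv_next() is the cursor-style alternative to
 * bdrv_iterate(); passing NULL starts at the first device.
 *
 *     BlockDriverState *it = NULL;
 *     while ((it = bdrv_next(it)) != NULL) {
 *         printf("%s\n", bdrv_get_device_name(it));
 *     }
 */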

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        int ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns the allocation status (a combination of BDRV_BLOCK_* flags) of
 * the specified sectors. Drivers not implementing the functionality are
 * assumed to not support backing files, hence all their sectors are
 * reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            qemu_aio_wait();
        }
    }
    return data.ret;
}
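
/*
 * Sketch of interpreting the result (illustrative; total_sectors is assumed
 * to be known to the caller): scan an image and classify each extent by
 * its BDRV_BLOCK_* flags.
 *
 *     int64_t sector = 0;
 *     while (sector < total_sectors) {
 *         int num;
 *         int64_t st = bdrv_get_block_status(bs, sector, 65536, &num);
 *         if (st < 0) {
 *             break;                        // error
 *         }
 *         if (st & BDRV_BLOCK_ZERO) {
 *             ... reads as zeroes ...
 *         } else if (st & BDRV_BLOCK_DATA) {
 *             ... allocated in this layer ...
 *         }
 *         sector += num;
 *     }
 */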

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return
        (ret & BDRV_BLOCK_DATA) ||
        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (TOP inclusive, BASE excluded).  BASE can be NULL to check
 * if the given sector is allocated in any image of the chain.  Return
 * false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
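
/*
 * Example chain query (illustrative): with the chain base <- mid <- top,
 * asking whether sector 100 is written anywhere above the base:
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, 100, 1, &pnum);
 *     // ret == 1 if 'top' or 'mid' allocates sector 100; the base itself
 *     // is not consulted.
 */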

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    BlockDriverState *curr_bs = NULL;

    if (!bs) {
        return NULL;
    }

    curr_bs = bs;

    while (curr_bs->backing_hd) {
        curr_bs = curr_bs->backing_hd;
    }
    return curr_bs;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
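
/*
 * Async usage sketch (hypothetical callback; names are illustrative):
 * submit a read and consume the result from the completion callback.
 *
 *     static void example_read_cb(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             ... I/O failed ...
 *         }
 *         ... data is now in the caller's qiov ...
 *     }
 *
 *     bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
 *                    example_read_cb, opaque);
 */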

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
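
/*
 * Usage sketch (illustrative; example_cb, ctx1 and ctx2 are hypothetical):
 * batch two writes so that adjacent requests can be merged before
 * submission.  Each BlockRequest carries its own completion callback.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = example_cb, .opaque = ctx1 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov2,
 *           .cb = example_cb, .opaque = ctx2 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... check reqs[i].error before waiting on callbacks ...
 *     }
 */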

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
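
/*
 * Worked example of the splitting loop above (illustrative values): with
 * bs->bl.discard_alignment = 8 and the default max_discard, a request for
 * sector_num = 5, nb_sectors = 20 is issued as two driver calls: first
 * 3 sectors ([5, 8)) to reach the alignment boundary, then the remaining
 * 17 sectors ([8, 25)) in one aligned call.
 */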

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5046

    
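/*
 * Worked example for the conversion above: a caller requesting a 64 KiB
 * granularity passes granularity = 65536 bytes.  Shifting right by
 * BDRV_SECTOR_BITS (9) gives 128 sectors per bit, and ffs(128) - 1 = 7,
 * so hbitmap_alloc() is told that each bit covers 2^7 = 128 sectors.
 */
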
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

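/*
 * Illustrative sketch (not part of the original file): walking every
 * dirty granule of one bitmap, as the mirror and backup jobs do.
 * Assumes hbitmap_iter_next() from "qemu/hbitmap.h", which yields the
 * next set bit or a negative value once the iterator is exhausted.
 */
#if 0
static void example_walk_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    HBitmapIter hbi;
    int64_t sector;

    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
        /* 'sector' is the first sector of a dirty granule */
    }
}
#endif
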
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If, after releasing, the reference count is zero, the BlockDriverState
 * is deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

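/*
 * Illustrative sketch (not part of the original file): the usual
 * pattern for pinning a BlockDriverState across an operation that may
 * yield, so it cannot be deleted underneath the caller.
 */
#if 0
static void example_long_running_op(BlockDriverState *bs)
{
    bdrv_ref(bs);       /* keep bs alive for the duration of the work */
    /* ... perform the operation ... */
    bdrv_unref(bs);     /* may call bdrv_delete() on the last reference */
}
#endif
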
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

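/*
 * In -drive terms the check above means iostatus is only reported when
 * the error policy can stop the VM, e.g. werror=stop, werror=enospc or
 * rerror=stop; with the default report/ignore policies there is no
 * stop state to expose.
 */
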
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

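/*
 * Illustrative sketch (not part of the original file): how a device
 * model brackets an asynchronous read with the accounting helpers
 * above.  The request struct and function names are hypothetical.
 */
#if 0
typedef struct ExampleReq {
    BlockDriverState *bs;
    BlockAcctCookie acct;
} ExampleReq;

static void example_read_cb(void *opaque, int ret)
{
    ExampleReq *req = opaque;

    bdrv_acct_done(req->bs, &req->acct);    /* credit bytes, ops, latency */
    g_free(req);
}

static void example_accounted_read(BlockDriverState *bs, int64_t sector_num,
                                   QEMUIOVector *qiov, int nb_sectors)
{
    ExampleReq *req = g_new0(ExampleReq, 1);

    req->bs = bs;
    bdrv_acct_start(bs, &req->acct, nb_sectors * BDRV_SECTOR_SIZE,
                    BDRV_ACCT_READ);
    bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, example_read_cb, req);
}
#endif
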
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (local_err) {
        error_propagate(errp, local_err);
    }
}

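/*
 * Illustrative sketch (not part of the original file): creating a 1 GiB
 * qcow2 image much as qemu-img create does.  The filename is arbitrary
 * and no backing file or extra options are passed.
 */
#if 0
static void example_create_image(void)
{
    Error *local_err = NULL;

    bdrv_img_create("test.qcow2", "qcow2",
                    NULL, NULL,             /* no backing file/format */
                    NULL,                   /* no -o option string */
                    1024 * 1024 * 1024,     /* img_size in bytes */
                    0, &local_err, false);
    if (local_err) {
        fprintf(stderr, "%s\n", error_get_pretty(local_err));
        error_free(local_err);
    }
}
#endif
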
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

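/*
 * This is the hook behind "qemu-img amend -o <options> <image>"; for
 * instance "qemu-img amend -o compat=0.10 test.qcow2" rewrites a qcow2
 * v3 image as v2.  Drivers that do not implement bdrv_amend_options
 * (in practice only qcow2 implements it) get -ENOTSUP.
 */
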
/* Used to recurse on single child block filters.
 * Single child block filters will store their child in bs->file.
 */
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (!bs->drv) {
        return false;
    }

    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
        if (bs == candidate) {
            return true;
        } else {
            return false;
        }
    }

    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
        return false;
    }

    if (!bs->file) {
        return false;
    }

    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}

bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    return bdrv_generic_is_first_non_filter(bs, candidate);
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain.  Since we do not have pointers to parents it explores all bs
 * chains from the top.  Some filters can choose not to pass down the
 * recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
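
/*
 * Illustrative example (not from the original file): given a chain
 *
 *   filter driver (bs A) -> qcow2 (bs B) -> file (bs C)
 *
 * bdrv_is_first_non_filter(B) is true: the walk starts at the top node A,
 * finds a filter that passes the recursion down, and reaches B as the
 * first non-filter node.  C sits below a non-filter, so the recursion
 * stops at B and never reports C as first.
 */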