Statistics
| Branch: | Revision:

root / block.c @ dc1c13d9

History | View | Annotate | Download (118.8 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor.h"
28
#include "block_int.h"
29
#include "module.h"
30
#include "qjson.h"
31
#include "qemu-coroutine.h"
32
#include "qmp-commands.h"
33
#include "qemu-timer.h"
34

    
35
#ifdef CONFIG_BSD
36
#include <sys/types.h>
37
#include <sys/stat.h>
38
#include <sys/ioctl.h>
39
#include <sys/queue.h>
40
#ifndef __DragonFly__
41
#include <sys/disk.h>
42
#endif
43
#endif
44

    
45
#ifdef _WIN32
46
#include <windows.h>
47
#endif
48

    
49
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50

    
51
/* Bit flags accepted by bdrv_co_do_readv() and bdrv_co_do_writev(). */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
55

    
56
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59
        BlockDriverCompletionFunc *cb, void *opaque);
60
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62
        BlockDriverCompletionFunc *cb, void *opaque);
63
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64
                                         int64_t sector_num, int nb_sectors,
65
                                         QEMUIOVector *iov);
66
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67
                                         int64_t sector_num, int nb_sectors,
68
                                         QEMUIOVector *iov);
69
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71
    BdrvRequestFlags flags);
72
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74
    BdrvRequestFlags flags);
75
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76
                                               int64_t sector_num,
77
                                               QEMUIOVector *qiov,
78
                                               int nb_sectors,
79
                                               BlockDriverCompletionFunc *cb,
80
                                               void *opaque,
81
                                               bool is_write);
82
static void coroutine_fn bdrv_co_do_rw(void *opaque);
83
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84
    int64_t sector_num, int nb_sectors);
85

    
86
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87
        bool is_write, double elapsed_time, uint64_t *wait);
88
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89
        double elapsed_time, uint64_t *wait);
90
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91
        bool is_write, int64_t *wait);
92

    
93
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
95

    
96
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
98

    
99
/* The device to use for VM snapshots */
100
static BlockDriverState *bs_snapshots;
101

    
102
/* If non-zero, use only whitelisted block drivers */
103
static int use_bdrv_whitelist;
104

    
105
#ifdef _WIN32
106
/*
 * Return non-zero when filename starts with an ASCII letter immediately
 * followed by ':' (for example "c:"), zero otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = (letter >= 'a' && letter <= 'z');
    const int is_upper = (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected once the first character matched */
    return (is_lower || is_upper) && filename[1] == ':';
}
112

    
113
/*
 * Return 1 when filename is exactly a drive specification ("d:") or begins
 * with the "\\.\" or "//./" prefix; return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    return (strstart(filename, "\\\\.\\", NULL) ||
            strstart(filename, "//./", NULL)) ? 1 : 0;
}
123
#endif
124

    
125
/* throttling disk I/O limits */
126
/*
 * Switch I/O throttling off for bs: clear the enabled flag, keep calling
 * qemu_co_queue_next() on the throttled queue until it reports nothing left,
 * release the block timer (if any) and reset the slice accounting.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs)) {
        /* the call itself does the work; loop until it returns false */
    }

    if (bs->block_timer != NULL) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}
143

    
144
static void bdrv_block_timer(void *opaque)
145
{
146
    BlockDriverState *bs = opaque;
147

    
148
    qemu_co_queue_next(&bs->throttled_reqs);
149
}
150

    
151
/*
 * Switch I/O throttling on for bs: initialise the throttled request queue,
 * create the block timer on vm_clock, open a first accounting slice of
 * 5 * BLOCK_IO_SLICE_TIME starting now, and clear the I/O baseline.
 */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);

    /* first slice: [now, now + slice_time) */
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;

    memset(&bs->io_base, 0, sizeof(bs->io_base));

    /* the flag is raised last, after all throttling state is set up */
    bs->io_limits_enabled = true;
}
161

    
162
/*
 * Return true when at least one bps or iops limit (read, write or total)
 * configured in bs->io_limits is non-zero.
 */
bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *limits = &bs->io_limits;

    if (limits->bps[BLOCK_IO_LIMIT_READ] ||
        limits->bps[BLOCK_IO_LIMIT_WRITE] ||
        limits->bps[BLOCK_IO_LIMIT_TOTAL]) {
        return true;
    }

    return limits->iops[BLOCK_IO_LIMIT_READ] ||
           limits->iops[BLOCK_IO_LIMIT_WRITE] ||
           limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
172

    
173
/*
 * Hold the current request back while it would exceed the I/O limits of bs.
 *
 * bs is the throttled device, is_write tells whether the request is a write
 * and nb_sectors is its size in sectors.
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_ns = -1;

    /* Other requests are already queued: line up behind them first. */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /*
     * Requests are meant to be serviced in FIFO order. Nothing else is
     * dequeued until the current request is allowed through, so while it
     * still exceeds the limits it is put back at the head of the queue and
     * every request behind it stays in throttled_reqs.
     */
    for (;;) {
        if (!bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_ns)) {
            break;
        }
        /* arm the timer for the computed wait, then sleep at the head */
        qemu_mod_timer(bs->block_timer,
                       wait_ns + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* this request may proceed; hand the queue on to the next waiter */
    qemu_co_queue_next(&bs->throttled_reqs);
}
197

    
198
/*
 * Return 1 if path starts with "<protocol>:", i.e. a ':' is found before any
 * directory separator; return 0 otherwise. On Windows, drive specifications
 * are never treated as protocols and '\\' also counts as a separator.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif
    /* index of the first ':' or separator, or of the terminating NUL */
    size_t first_stop = strcspn(path, stop_chars);

    return path[first_stop] == ':';
}
215

    
216
/*
 * Return non-zero when path is absolute: it starts with '/', or on Windows
 * also with '\\' or a drive specification (including names like "\\.\d:").
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (first == '/' || first == '\\');
#else
    return (first == '/');
#endif
}
228

    
229
/*
 * Build in dest (at most dest_size bytes, always NUL-terminated when
 * dest_size > 0) the path of filename interpreted relative to base_path.
 *
 * An absolute filename is copied unchanged. Otherwise the leading part of
 * base_path is kept up to whichever ends later: the first ':' or the last
 * directory separator. filename is then appended, which is how URL-like
 * base paths are supported.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* just past the first ':' of base_path, or its start if there is none */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* just past the last separator of base_path, or its start */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep the longer of the two prefixes, clamped to the buffer size */
    if (after_sep > after_colon) {
        after_colon = after_sep;
    }
    prefix_len = after_colon - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }

    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
272

    
273
/*
 * Write the full name of the backing file of bs into dest (sz bytes).
 *
 * An empty backing file name, or one that carries a "<protocol>:" prefix, is
 * copied verbatim; anything else is combined with bs->filename.
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;

    if (backing[0] != '\0' && !path_has_protocol(backing)) {
        path_combine(dest, sz, bs->filename, backing);
        return;
    }

    pstrcpy(dest, sz, backing);
}
281

    
282
/*
 * Add bdrv to the list of known block drivers, installing the emulation
 * entry points for any I/O interfaces the driver does not provide.
 */
void bdrv_register(BlockDriver *bdrv)
{
    if (bdrv->bdrv_co_readv == NULL) {
        /* No coroutine interface: emulate it */
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /*
         * bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so
         * a driver without aio needs that layer emulated as well.
         */
        if (bdrv->bdrv_aio_readv == NULL) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
301

    
302
/* create a new block device (by default it is empty) */
303
BlockDriverState *bdrv_new(const char *device_name)
304
{
305
    BlockDriverState *bs;
306

    
307
    bs = g_malloc0(sizeof(BlockDriverState));
308
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
309
    if (device_name[0] != '\0') {
310
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
311
    }
312
    bdrv_iostatus_disable(bs);
313
    return bs;
314
}
315

    
316
BlockDriver *bdrv_find_format(const char *format_name)
317
{
318
    BlockDriver *drv1;
319
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
320
        if (!strcmp(drv1->format_name, format_name)) {
321
            return drv1;
322
        }
323
    }
324
    return NULL;
325
}
326

    
327
/*
 * Return 1 if drv may be used according to the build-time whitelist
 * (CONFIG_BDRV_WHITELIST), 0 otherwise. A whitelist whose first entry is
 * NULL means every driver is allowed.
 */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    /* the scan below stops at the first NULL entry of this table */
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **entry = whitelist;

    if (*entry == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    while (*entry != NULL) {
        if (strcmp(drv->format_name, *entry) == 0) {
            return 1;
        }
        entry++;
    }

    return 0;
}
344

    
345
/*
 * Like bdrv_find_format(), but returns NULL as well when the driver that was
 * found is not whitelisted.
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (drv == NULL || !bdrv_is_whitelisted(drv)) {
        return NULL;
    }

    return drv;
}
350

    
351
typedef struct CreateCo {
352
    BlockDriver *drv;
353
    char *filename;
354
    QEMUOptionParameter *options;
355
    int ret;
356
} CreateCo;
357

    
358
/*
 * Coroutine entry point for bdrv_create(): opaque is a CreateCo whose drv
 * must be set; the driver's result is stored in its ret field.
 */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *args = opaque;

    assert(args->drv);

    args->ret = args->drv->bdrv_create(args->filename, args->options);
}
365

    
366
int bdrv_create(BlockDriver *drv, const char* filename,
367
    QEMUOptionParameter *options)
368
{
369
    int ret;
370

    
371
    Coroutine *co;
372
    CreateCo cco = {
373
        .drv = drv,
374
        .filename = g_strdup(filename),
375
        .options = options,
376
        .ret = NOT_DONE,
377
    };
378

    
379
    if (!drv->bdrv_create) {
380
        return -ENOTSUP;
381
    }
382

    
383
    if (qemu_in_coroutine()) {
384
        /* Fast-path if already in coroutine context */
385
        bdrv_create_co_entry(&cco);
386
    } else {
387
        co = qemu_coroutine_create(bdrv_create_co_entry);
388
        qemu_coroutine_enter(co, &cco);
389
        while (cco.ret == NOT_DONE) {
390
            qemu_aio_wait();
391
        }
392
    }
393

    
394
    ret = cco.ret;
395
    g_free(cco.filename);
396

    
397
    return ret;
398
}
399

    
400
/*
 * Create filename using the protocol driver selected by
 * bdrv_find_protocol(). Returns -ENOENT when no driver is found, otherwise
 * the result of bdrv_create().
 */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *proto_drv = bdrv_find_protocol(filename);

    if (!proto_drv) {
        return -ENOENT;
    }

    return bdrv_create(proto_drv, filename, options);
}
411

    
412
/*
 * Create a uniquely-named empty temporary file.
 *
 * filename receives the name of the new file and size is the capacity of
 * that buffer in bytes. On Windows, size must be at least MAX_PATH.
 * Elsewhere the file is created in $TMPDIR, falling back to "/tmp".
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows the
 * negated GetLastError() code). -EOVERFLOW means the name does not fit.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno; report the close() failure */
        int close_errno = errno;

        unlink(filename);
        return -close_errno;
    }
    return 0;
#endif
}
446

    
447
/*
448
 * Detect host devices. By convention, /dev/cdrom[N] is always
449
 * recognized as a host CDROM.
450
 */
451
static BlockDriver *find_hdev_driver(const char *filename)
452
{
453
    int score_max = 0, score;
454
    BlockDriver *drv = NULL, *d;
455

    
456
    QLIST_FOREACH(d, &bdrv_drivers, list) {
457
        if (d->bdrv_probe_device) {
458
            score = d->bdrv_probe_device(filename);
459
            if (score > score_max) {
460
                score_max = score;
461
                drv = d;
462
            }
463
        }
464
    }
465

    
466
    return drv;
467
}
468

    
469
BlockDriver *bdrv_find_protocol(const char *filename)
470
{
471
    BlockDriver *drv1;
472
    char protocol[128];
473
    int len;
474
    const char *p;
475

    
476
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
477

    
478
    /*
479
     * XXX(hch): we really should not let host device detection
480
     * override an explicit protocol specification, but moving this
481
     * later breaks access to device names with colons in them.
482
     * Thanks to the brain-dead persistent naming schemes on udev-
483
     * based Linux systems those actually are quite common.
484
     */
485
    drv1 = find_hdev_driver(filename);
486
    if (drv1) {
487
        return drv1;
488
    }
489

    
490
    if (!path_has_protocol(filename)) {
491
        return bdrv_find_format("file");
492
    }
493
    p = strchr(filename, ':');
494
    assert(p != NULL);
495
    len = p - filename;
496
    if (len > sizeof(protocol) - 1)
497
        len = sizeof(protocol) - 1;
498
    memcpy(protocol, filename, len);
499
    protocol[len] = '\0';
500
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
501
        if (drv1->protocol_name &&
502
            !strcmp(drv1->protocol_name, protocol)) {
503
            return drv1;
504
        }
505
    }
506
    return NULL;
507
}
508

    
509
/*
 * Probe the image format of filename and store the chosen driver in *pdrv.
 *
 * The file is opened through its protocol and up to 2048 bytes from offset 0
 * are offered to every driver's bdrv_probe(); the highest score above zero
 * wins. scsi-generic devices and empty drives get the "raw" driver without
 * probing.
 *
 * Returns a negative errno and sets *pdrv to NULL on failure (-ENOENT when
 * no driver is found). On success the return value is the non-negative
 * result of the last bdrv_file_open()/bdrv_pread() call.
 */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score;
    int best_score = 0;
    BlockDriver *cur;
    BlockDriver *best = NULL;
    uint8_t header[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        best = bdrv_find_format("raw");
        if (best == NULL) {
            ret = -ENOENT;
        }
        *pdrv = best;
        return ret;
    }

    ret = bdrv_pread(bs, 0, header, sizeof(header));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* ret holds what bdrv_pread() returned and is passed on as the size */
    QLIST_FOREACH(cur, &bdrv_drivers, list) {
        if (!cur->bdrv_probe) {
            continue;
        }
        score = cur->bdrv_probe(header, ret, filename);
        if (score > best_score) {
            best_score = score;
            best = cur;
        }
    }

    if (best == NULL) {
        ret = -ENOENT;
    }
    *pdrv = best;
    return ret;
}
557

    
558
/**
559
 * Set the current 'total_sectors' value
560
 */
561
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
562
{
563
    BlockDriver *drv = bs->drv;
564

    
565
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
566
    if (bs->sg)
567
        return 0;
568

    
569
    /* query actual device if possible, otherwise just trust the hint */
570
    if (drv->bdrv_getlength) {
571
        int64_t length = drv->bdrv_getlength(bs);
572
        if (length < 0) {
573
            return length;
574
        }
575
        hint = length >> BDRV_SECTOR_BITS;
576
    }
577

    
578
    bs->total_sectors = hint;
579
    return 0;
580
}
581

    
582
/**
583
 * Set open flags for a given cache mode
584
 *
585
 * Return 0 on success, -1 if the cache mode was invalid.
586
 */
587
int bdrv_parse_cache_flags(const char *mode, int *flags)
588
{
589
    *flags &= ~BDRV_O_CACHE_MASK;
590

    
591
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
592
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
593
    } else if (!strcmp(mode, "directsync")) {
594
        *flags |= BDRV_O_NOCACHE;
595
    } else if (!strcmp(mode, "writeback")) {
596
        *flags |= BDRV_O_CACHE_WB;
597
    } else if (!strcmp(mode, "unsafe")) {
598
        *flags |= BDRV_O_CACHE_WB;
599
        *flags |= BDRV_O_NO_FLUSH;
600
    } else if (!strcmp(mode, "writethrough")) {
601
        /* this is the default */
602
    } else {
603
        return -1;
604
    }
605

    
606
    return 0;
607
}
608

    
609
/**
610
 * The copy-on-read flag is actually a reference count so multiple users may
611
 * use the feature without worrying about clobbering its previous state.
612
 * Copy-on-read stays enabled until all users have called to disable it.
613
 */
614
void bdrv_enable_copy_on_read(BlockDriverState *bs)
615
{
616
    bs->copy_on_read++;
617
}
618

    
619
/*
 * Drop one copy-on-read reference on bs. Asserts that at least one
 * reference is currently held.
 */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
624

    
625
/*
626
 * Common part for opening disk images and files
627
 */
628
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
629
    int flags, BlockDriver *drv)
630
{
631
    int ret, open_flags;
632

    
633
    assert(drv != NULL);
634
    assert(bs->file == NULL);
635

    
636
    trace_bdrv_open_common(bs, filename, flags, drv->format_name);
637

    
638
    bs->open_flags = flags;
639
    bs->buffer_alignment = 512;
640

    
641
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
642
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
643
        bdrv_enable_copy_on_read(bs);
644
    }
645

    
646
    pstrcpy(bs->filename, sizeof(bs->filename), filename);
647

    
648
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
649
        return -ENOTSUP;
650
    }
651

    
652
    bs->drv = drv;
653
    bs->opaque = g_malloc0(drv->instance_size);
654

    
655
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
656
    open_flags = flags | BDRV_O_CACHE_WB;
657

    
658
    /*
659
     * Clear flags that are internal to the block layer before opening the
660
     * image.
661
     */
662
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
663

    
664
    /*
665
     * Snapshots should be writable.
666
     */
667
    if (bs->is_temporary) {
668
        open_flags |= BDRV_O_RDWR;
669
    }
670

    
671
    bs->read_only = !(open_flags & BDRV_O_RDWR);
672

    
673
    /* Open the image, either directly or using a protocol */
674
    if (drv->bdrv_file_open) {
675
        ret = drv->bdrv_file_open(bs, filename, open_flags);
676
    } else {
677
        ret = bdrv_file_open(&bs->file, filename, open_flags);
678
        if (ret >= 0) {
679
            ret = drv->bdrv_open(bs, open_flags);
680
        }
681
    }
682

    
683
    if (ret < 0) {
684
        goto free_and_fail;
685
    }
686

    
687
    ret = refresh_total_sectors(bs, bs->total_sectors);
688
    if (ret < 0) {
689
        goto free_and_fail;
690
    }
691

    
692
#ifndef _WIN32
693
    if (bs->is_temporary) {
694
        unlink(filename);
695
    }
696
#endif
697
    return 0;
698

    
699
free_and_fail:
700
    if (bs->file) {
701
        bdrv_delete(bs->file);
702
        bs->file = NULL;
703
    }
704
    g_free(bs->opaque);
705
    bs->opaque = NULL;
706
    bs->drv = NULL;
707
    return ret;
708
}
709

    
710
/*
711
 * Opens a file using a protocol (file, host_device, nbd, ...)
712
 */
713
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
714
{
715
    BlockDriverState *bs;
716
    BlockDriver *drv;
717
    int ret;
718

    
719
    drv = bdrv_find_protocol(filename);
720
    if (!drv) {
721
        return -ENOENT;
722
    }
723

    
724
    bs = bdrv_new("");
725
    ret = bdrv_open_common(bs, filename, flags, drv);
726
    if (ret < 0) {
727
        bdrv_delete(bs);
728
        return ret;
729
    }
730
    bs->growable = 1;
731
    *pbs = bs;
732
    return 0;
733
}
734

    
735
/*
736
 * Opens a disk image (raw, qcow2, vmdk, ...)
737
 */
738
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
739
              BlockDriver *drv)
740
{
741
    int ret;
742
    char tmp_filename[PATH_MAX];
743

    
744
    if (flags & BDRV_O_SNAPSHOT) {
745
        BlockDriverState *bs1;
746
        int64_t total_size;
747
        int is_protocol = 0;
748
        BlockDriver *bdrv_qcow2;
749
        QEMUOptionParameter *options;
750
        char backing_filename[PATH_MAX];
751

    
752
        /* if snapshot, we create a temporary backing file and open it
753
           instead of opening 'filename' directly */
754

    
755
        /* if there is a backing file, use it */
756
        bs1 = bdrv_new("");
757
        ret = bdrv_open(bs1, filename, 0, drv);
758
        if (ret < 0) {
759
            bdrv_delete(bs1);
760
            return ret;
761
        }
762
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
763

    
764
        if (bs1->drv && bs1->drv->protocol_name)
765
            is_protocol = 1;
766

    
767
        bdrv_delete(bs1);
768

    
769
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
770
        if (ret < 0) {
771
            return ret;
772
        }
773

    
774
        /* Real path is meaningless for protocols */
775
        if (is_protocol)
776
            snprintf(backing_filename, sizeof(backing_filename),
777
                     "%s", filename);
778
        else if (!realpath(filename, backing_filename))
779
            return -errno;
780

    
781
        bdrv_qcow2 = bdrv_find_format("qcow2");
782
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
783

    
784
        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
785
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
786
        if (drv) {
787
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
788
                drv->format_name);
789
        }
790

    
791
        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
792
        free_option_parameters(options);
793
        if (ret < 0) {
794
            return ret;
795
        }
796

    
797
        filename = tmp_filename;
798
        drv = bdrv_qcow2;
799
        bs->is_temporary = 1;
800
    }
801

    
802
    /* Find the right image format driver */
803
    if (!drv) {
804
        ret = find_image_format(filename, &drv);
805
    }
806

    
807
    if (!drv) {
808
        goto unlink_and_fail;
809
    }
810

    
811
    if (flags & BDRV_O_RDWR) {
812
        flags |= BDRV_O_ALLOW_RDWR;
813
    }
814

    
815
    /* Open the image */
816
    ret = bdrv_open_common(bs, filename, flags, drv);
817
    if (ret < 0) {
818
        goto unlink_and_fail;
819
    }
820

    
821
    /* If there is a backing file, use it */
822
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
823
        char backing_filename[PATH_MAX];
824
        int back_flags;
825
        BlockDriver *back_drv = NULL;
826

    
827
        bs->backing_hd = bdrv_new("");
828
        bdrv_get_full_backing_filename(bs, backing_filename,
829
                                       sizeof(backing_filename));
830

    
831
        if (bs->backing_format[0] != '\0') {
832
            back_drv = bdrv_find_format(bs->backing_format);
833
        }
834

    
835
        /* backing files always opened read-only */
836
        back_flags =
837
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
838

    
839
        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
840
        if (ret < 0) {
841
            bdrv_close(bs);
842
            return ret;
843
        }
844
    }
845

    
846
    if (!bdrv_key_required(bs)) {
847
        bdrv_dev_change_media_cb(bs, true);
848
    }
849

    
850
    /* throttling disk I/O limits */
851
    if (bs->io_limits_enabled) {
852
        bdrv_io_limits_enable(bs);
853
    }
854

    
855
    return 0;
856

    
857
unlink_and_fail:
858
    if (bs->is_temporary) {
859
        unlink(filename);
860
    }
861
    return ret;
862
}
863

    
864
typedef struct BlockReopenQueueEntry {
865
     bool prepared;
866
     BDRVReopenState state;
867
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
868
} BlockReopenQueueEntry;
869

    
870
/*
871
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
872
 * reopen of multiple devices.
873
 *
874
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
875
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
876
 * be created and initialized. This newly created BlockReopenQueue should be
877
 * passed back in for subsequent calls that are intended to be of the same
878
 * atomic 'set'.
879
 *
880
 * bs is the BlockDriverState to add to the reopen queue.
881
 *
882
 * flags contains the open flags for the associated bs
883
 *
884
 * returns a pointer to bs_queue, which is either the newly allocated
885
 * bs_queue, or the existing bs_queue being used.
886
 *
887
 */
888
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
889
                                    BlockDriverState *bs, int flags)
890
{
891
    assert(bs != NULL);
892

    
893
    BlockReopenQueueEntry *bs_entry;
894
    if (bs_queue == NULL) {
895
        bs_queue = g_new0(BlockReopenQueue, 1);
896
        QSIMPLEQ_INIT(bs_queue);
897
    }
898

    
899
    if (bs->file) {
900
        bdrv_reopen_queue(bs_queue, bs->file, flags);
901
    }
902

    
903
    bs_entry = g_new0(BlockReopenQueueEntry, 1);
904
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
905

    
906
    bs_entry->state.bs = bs;
907
    bs_entry->state.flags = flags;
908

    
909
    return bs_queue;
910
}
911

    
912
/*
913
 * Reopen multiple BlockDriverStates atomically & transactionally.
914
 *
915
 * The queue passed in (bs_queue) must have been built up previous
916
 * via bdrv_reopen_queue().
917
 *
918
 * Reopens all BDS specified in the queue, with the appropriate
919
 * flags.  All devices are prepared for reopen, and failure of any
920
 * device will cause all device changes to be abandonded, and intermediate
921
 * data cleaned up.
922
 *
923
 * If all devices prepare successfully, then the changes are committed
924
 * to all devices.
925
 *
926
 */
927
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
928
{
929
    int ret = -1;
930
    BlockReopenQueueEntry *bs_entry, *next;
931
    Error *local_err = NULL;
932

    
933
    assert(bs_queue != NULL);
934

    
935
    bdrv_drain_all();
936

    
937
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
938
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
939
            error_propagate(errp, local_err);
940
            goto cleanup;
941
        }
942
        bs_entry->prepared = true;
943
    }
944

    
945
    /* If we reach this point, we have success and just need to apply the
946
     * changes
947
     */
948
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
949
        bdrv_reopen_commit(&bs_entry->state);
950
    }
951

    
952
    ret = 0;
953

    
954
cleanup:
955
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
956
        if (ret && bs_entry->prepared) {
957
            bdrv_reopen_abort(&bs_entry->state);
958
        }
959
        g_free(bs_entry);
960
    }
961
    g_free(bs_queue);
962
    return ret;
963
}
964

    
965

    
966
/* Reopen a single BlockDriverState with the specified flags. */
967
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
968
{
969
    int ret = -1;
970
    Error *local_err = NULL;
971
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
972

    
973
    ret = bdrv_reopen_multiple(queue, &local_err);
974
    if (local_err != NULL) {
975
        error_propagate(errp, local_err);
976
    }
977
    return ret;
978
}
979

    
980

    
981
/*
982
 * Prepares a BlockDriverState for reopen. All changes are staged in the
983
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
984
 * the block driver layer .bdrv_reopen_prepare()
985
 *
986
 * bs is the BlockDriverState to reopen
987
 * flags are the new open flags
988
 * queue is the reopen queue
989
 *
990
 * Returns 0 on success, non-zero on error.  On error errp will be set
991
 * as well.
992
 *
993
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
994
 * It is the responsibility of the caller to then call the abort() or
995
 * commit() for any other BDS that have been left in a prepare() state
996
 *
997
 */
998
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
999
                        Error **errp)
1000
{
1001
    int ret = -1;
1002
    Error *local_err = NULL;
1003
    BlockDriver *drv;
1004

    
1005
    assert(reopen_state != NULL);
1006
    assert(reopen_state->bs->drv != NULL);
1007
    drv = reopen_state->bs->drv;
1008

    
1009
    /* if we are to stay read-only, do not allow permission change
1010
     * to r/w */
1011
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1012
        reopen_state->flags & BDRV_O_RDWR) {
1013
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1014
                  reopen_state->bs->device_name);
1015
        goto error;
1016
    }
1017

    
1018

    
1019
    ret = bdrv_flush(reopen_state->bs);
1020
    if (ret) {
1021
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1022
                  strerror(-ret));
1023
        goto error;
1024
    }
1025

    
1026
    if (drv->bdrv_reopen_prepare) {
1027
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1028
        if (ret) {
1029
            if (local_err != NULL) {
1030
                error_propagate(errp, local_err);
1031
            } else {
1032
                error_set(errp, QERR_OPEN_FILE_FAILED,
1033
                          reopen_state->bs->filename);
1034
            }
1035
            goto error;
1036
        }
1037
    } else {
1038
        /* It is currently mandatory to have a bdrv_reopen_prepare()
1039
         * handler for each supported drv. */
1040
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1041
                  drv->format_name, reopen_state->bs->device_name,
1042
                 "reopening of file");
1043
        ret = -1;
1044
        goto error;
1045
    }
1046

    
1047
    ret = 0;
1048

    
1049
error:
1050
    return ret;
1051
}
1052

    
1053
/*
1054
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1055
 * makes them final by swapping the staging BlockDriverState contents into
1056
 * the active BlockDriverState contents.
1057
 */
1058
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1059
{
1060
    BlockDriver *drv;
1061

    
1062
    assert(reopen_state != NULL);
1063
    drv = reopen_state->bs->drv;
1064
    assert(drv != NULL);
1065

    
1066
    /* If there are any driver level actions to take */
1067
    if (drv->bdrv_reopen_commit) {
1068
        drv->bdrv_reopen_commit(reopen_state);
1069
    }
1070

    
1071
    /* set BDS specific flags now */
1072
    reopen_state->bs->open_flags         = reopen_state->flags;
1073
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1074
                                              BDRV_O_CACHE_WB);
1075
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1076
}
1077

    
1078
/*
1079
 * Abort the reopen, and delete and free the staged changes in
1080
 * reopen_state
1081
 */
1082
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1083
{
1084
    BlockDriver *drv;
1085

    
1086
    assert(reopen_state != NULL);
1087
    drv = reopen_state->bs->drv;
1088
    assert(drv != NULL);
1089

    
1090
    if (drv->bdrv_reopen_abort) {
1091
        drv->bdrv_reopen_abort(reopen_state);
1092
    }
1093
}
1094

    
1095

    
1096
/*
 * Close a BlockDriverState.
 *
 * Flushes bs and, when a driver is attached, cancels its job, drains all
 * requests, deletes the backing and protocol-level BlockDriverStates, calls
 * the driver's close handler and resets the per-image fields.  Afterwards
 * the attached device is told about the media change and I/O throttling is
 * turned off.
 */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);

    if (bs->drv != NULL) {
        if (bs->job != NULL) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        /* Do not leave the snapshot pointer referring to a closed image */
        if (bs_snapshots == bs) {
            bs_snapshots = NULL;
        }

        if (bs->backing_hd != NULL) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }

        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif

        /* Reset everything that described the image just closed */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* Turn off disk I/O throttling */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1143

    
1144
void bdrv_close_all(void)
1145
{
1146
    BlockDriverState *bs;
1147

    
1148
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1149
        bdrv_close(bs);
1150
    }
1151
}
1152

    
1153
/*
1154
 * Wait for pending requests to complete across all BlockDriverStates
1155
 *
1156
 * This function does not flush data to disk, use bdrv_flush_all() for that
1157
 * after calling this function.
1158
 *
1159
 * Note that completion of an asynchronous I/O operation can trigger any
1160
 * number of other I/O operations on other devices---for example a coroutine
1161
 * can be arbitrarily complex and a constant flow of I/O can come until the
1162
 * coroutine is complete.  Because of this, it is not possible to have a
1163
 * function to drain a single device's I/O queue.
1164
 */
1165
void bdrv_drain_all(void)
1166
{
1167
    BlockDriverState *bs;
1168
    bool busy;
1169

    
1170
    do {
1171
        busy = qemu_aio_wait();
1172

    
1173
        /* FIXME: We do not have timer support here, so this is effectively
1174
         * a busy wait.
1175
         */
1176
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
1177
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
1178
                qemu_co_queue_restart_all(&bs->throttled_reqs);
1179
                busy = true;
1180
            }
1181
        }
1182
    } while (busy);
1183

    
1184
    /* If requests are still pending there is a bug somewhere */
1185
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1186
        assert(QLIST_EMPTY(&bs->tracked_requests));
1187
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
1188
    }
1189
}
1190

    
1191
/* make a BlockDriverState anonymous by removing from bdrv_state list.
1192
   Also, NULL terminate the device_name to prevent double remove */
1193
void bdrv_make_anon(BlockDriverState *bs)
1194
{
1195
    if (bs->device_name[0] != '\0') {
1196
        QTAILQ_REMOVE(&bdrv_states, bs, list);
1197
    }
1198
    bs->device_name[0] = '\0';
1199
}
1200

    
1201
/* Invoke the driver's bdrv_rebind handler for bs, if a driver is attached
 * and it provides one. */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (!bs->drv) {
        return;
    }
    if (!bs->drv->bdrv_rebind) {
        return;
    }
    bs->drv->bdrv_rebind(bs);
}
1207

    
1208
/*
 * Copy from bs_src to bs_dest the fields that have to stay attached to the
 * device rather than follow the image contents.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* identity: same name and same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;

    /* open flags and cache mode */
    bs_dest->open_flags = bs_src->open_flags;
    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* attached device */
    bs_dest->dev = bs_src->dev;
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    /* I/O throttling state and timing */
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
    bs_dest->io_limits = bs_src->io_limits;
    bs_dest->io_base = bs_src->io_base;
    bs_dest->slice_time = bs_src->slice_time;
    bs_dest->slice_start = bs_src->slice_start;
    bs_dest->slice_end = bs_src->slice_end;
    bs_dest->throttled_reqs = bs_src->throttled_reqs;
    bs_dest->block_timer = bs_src->block_timer;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty tracking */
    bs_dest->dirty_count = bs_src->dirty_count;
    bs_dest->dirty_bitmap = bs_src->dirty_bitmap;

    /* block job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;
}
1254

    
1255
/*
1256
 * Swap bs contents for two image chains while they are live,
1257
 * while keeping required fields on the BlockDriverState that is
1258
 * actually attached to a device.
1259
 *
1260
 * This will modify the BlockDriverState fields, and swap contents
1261
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1262
 *
1263
 * bs_new is required to be anonymous.
1264
 *
1265
 * This function does not create any image files.
1266
 */
1267
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1268
{
1269
    BlockDriverState tmp;
1270

    
1271
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1272
    assert(bs_new->device_name[0] == '\0');
1273
    assert(bs_new->dirty_bitmap == NULL);
1274
    assert(bs_new->job == NULL);
1275
    assert(bs_new->dev == NULL);
1276
    assert(bs_new->in_use == 0);
1277
    assert(bs_new->io_limits_enabled == false);
1278
    assert(bs_new->block_timer == NULL);
1279

    
1280
    tmp = *bs_new;
1281
    *bs_new = *bs_old;
1282
    *bs_old = tmp;
1283

    
1284
    /* there are some fields that should not be swapped, move them back */
1285
    bdrv_move_feature_fields(&tmp, bs_old);
1286
    bdrv_move_feature_fields(bs_old, bs_new);
1287
    bdrv_move_feature_fields(bs_new, &tmp);
1288

    
1289
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1290
    assert(bs_new->device_name[0] == '\0');
1291

    
1292
    /* Check a few fields that should remain attached to the device */
1293
    assert(bs_new->dev == NULL);
1294
    assert(bs_new->job == NULL);
1295
    assert(bs_new->in_use == 0);
1296
    assert(bs_new->io_limits_enabled == false);
1297
    assert(bs_new->block_timer == NULL);
1298

    
1299
    bdrv_rebind(bs_new);
1300
    bdrv_rebind(bs_old);
1301
}
1302

    
1303
/*
1304
 * Add new bs contents at the top of an image chain while the chain is
1305
 * live, while keeping required fields on the top layer.
1306
 *
1307
 * This will modify the BlockDriverState fields, and swap contents
1308
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1309
 *
1310
 * bs_new is required to be anonymous.
1311
 *
1312
 * This function does not create any image files.
1313
 */
1314
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1315
{
1316
    bdrv_swap(bs_new, bs_top);
1317

    
1318
    /* The contents of 'tmp' will become bs_top, as we are
1319
     * swapping bs_new and bs_top contents. */
1320
    bs_top->backing_hd = bs_new;
1321
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1322
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1323
            bs_new->filename);
1324
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1325
            bs_new->drv ? bs_new->drv->format_name : "");
1326
}
1327

    
1328
/*
 * Close and free a BlockDriverState.  It must have no attached device, no
 * block job and must not be marked in use.
 */
void bdrv_delete(BlockDriverState *bs)
{
    assert(bs->dev == NULL);
    assert(bs->job == NULL);
    assert(!bs->in_use);

    /* Takes bs off the bdrv_states list when it is on it */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs_snapshots != bs);
    g_free(bs);
}
1342

    
1343
/*
 * Attach dev to bs and reset the I/O status.
 *
 * Returns 0 on success, -EBUSY when a device is already attached.
 *
 * TODO change to DeviceState *dev when all users are qdevified
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
{
    if (bs->dev != NULL) {
        return -EBUSY;
    }

    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}
1353

    
1354
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1355
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1356
{
1357
    if (bdrv_attach_dev(bs, dev) < 0) {
1358
        abort();
1359
    }
1360
}
1361

    
1362
/*
 * Detach dev, which must be the device currently attached to bs, and reset
 * the device callbacks and the buffer alignment.
 *
 * TODO change to DeviceState *dev when all users are qdevified
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
{
    assert(dev == bs->dev);

    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}
1371

    
1372
/* TODO change to return DeviceState * when all users are qdevified */
1373
void *bdrv_get_attached_dev(BlockDriverState *bs)
1374
{
1375
    return bs->dev;
1376
}
1377

    
1378
/*
 * Install the device callbacks and their opaque argument on bs.  If bs
 * then counts as having removable media and is the current bs_snapshots,
 * bs_snapshots is cleared.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;

    if (bdrv_dev_has_removable_media(bs)) {
        if (bs_snapshots == bs) {
            bs_snapshots = NULL;
        }
    }
}
1387

    
1388
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1389
                               BlockQMPEventAction action, int is_read)
1390
{
1391
    QObject *data;
1392
    const char *action_str;
1393

    
1394
    switch (action) {
1395
    case BDRV_ACTION_REPORT:
1396
        action_str = "report";
1397
        break;
1398
    case BDRV_ACTION_IGNORE:
1399
        action_str = "ignore";
1400
        break;
1401
    case BDRV_ACTION_STOP:
1402
        action_str = "stop";
1403
        break;
1404
    default:
1405
        abort();
1406
    }
1407

    
1408
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1409
                              bdrv->device_name,
1410
                              action_str,
1411
                              is_read ? "read" : "write");
1412
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1413

    
1414
    qobject_decref(data);
1415
}
1416

    
1417
/* Emit a QEVENT_DEVICE_TRAY_MOVED monitor event carrying the device name
 * of bs and the new tray state. */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    const char *device = bdrv_get_device_name(bs);
    QObject *event = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                        device, ejected);

    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event);
    qobject_decref(event);
}
1427

    
1428
/*
 * Tell the attached device about a media change, if it registered a
 * change_media_cb, and emit the matching tray events: "open" when the tray
 * was closed before the callback ran, "closed" when media was loaded.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool was_open;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* Sample the tray state before the callback runs */
    was_open = bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (!was_open) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1443

    
1444
/* True when no device is attached to bs, or when the attached device
 * registered a change_media_cb callback. */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    if (!bs->dev_ops) {
        return false;
    }
    return bs->dev_ops->change_media_cb != NULL;
}
1448

    
1449
/* Forward an eject request to the attached device's eject_request_cb, if
 * it registered one. */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (!bs->dev_ops || !bs->dev_ops->eject_request_cb) {
        return;
    }
    bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
}
1455

    
1456
/* Ask the attached device whether its tray is open; false when the device
 * offers no is_tray_open callback. */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_tray_open) {
        return false;
    }
    return bs->dev_ops->is_tray_open(bs->dev_opaque);
}
1463

    
1464
/* Invoke the attached device's resize_cb callback, if it registered one. */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->resize_cb) {
        return;
    }
    bs->dev_ops->resize_cb(bs->dev_opaque);
}
1470

    
1471
/* Ask the attached device whether its medium is locked; false when the
 * device offers no is_medium_locked callback. */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_medium_locked) {
        return false;
    }
    return bs->dev_ops->is_medium_locked(bs->dev_opaque);
}
1478

    
1479
/*
1480
 * Run consistency checks on an image
1481
 *
1482
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1483
 * free of errors) or -errno when an internal error occurred. The results of the
1484
 * check are stored in res.
1485
 */
1486
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1487
{
1488
    if (bs->drv->bdrv_check == NULL) {
1489
        return -ENOTSUP;
1490
    }
1491

    
1492
    memset(res, 0, sizeof(*res));
1493
    return bs->drv->bdrv_check(bs, res, fix);
1494
}
1495

    
1496
#define COMMIT_BUF_SECTORS 2048
1497

    
1498
/* commit COW file into the raw image */
1499
int bdrv_commit(BlockDriverState *bs)
1500
{
1501
    BlockDriver *drv = bs->drv;
1502
    int64_t sector, total_sectors;
1503
    int n, ro, open_flags;
1504
    int ret = 0;
1505
    uint8_t *buf;
1506
    char filename[1024];
1507

    
1508
    if (!drv)
1509
        return -ENOMEDIUM;
1510
    
1511
    if (!bs->backing_hd) {
1512
        return -ENOTSUP;
1513
    }
1514

    
1515
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1516
        return -EBUSY;
1517
    }
1518

    
1519
    ro = bs->backing_hd->read_only;
1520
    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1521
    open_flags =  bs->backing_hd->open_flags;
1522

    
1523
    if (ro) {
1524
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1525
            return -EACCES;
1526
        }
1527
    }
1528

    
1529
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1530
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1531

    
1532
    for (sector = 0; sector < total_sectors; sector += n) {
1533
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1534

    
1535
            if (bdrv_read(bs, sector, buf, n) != 0) {
1536
                ret = -EIO;
1537
                goto ro_cleanup;
1538
            }
1539

    
1540
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1541
                ret = -EIO;
1542
                goto ro_cleanup;
1543
            }
1544
        }
1545
    }
1546

    
1547
    if (drv->bdrv_make_empty) {
1548
        ret = drv->bdrv_make_empty(bs);
1549
        bdrv_flush(bs);
1550
    }
1551

    
1552
    /*
1553
     * Make sure all data we wrote to the backing device is actually
1554
     * stable on disk.
1555
     */
1556
    if (bs->backing_hd)
1557
        bdrv_flush(bs->backing_hd);
1558

    
1559
ro_cleanup:
1560
    g_free(buf);
1561

    
1562
    if (ro) {
1563
        /* ignoring error return here */
1564
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1565
    }
1566

    
1567
    return ret;
1568
}
1569

    
1570
int bdrv_commit_all(void)
1571
{
1572
    BlockDriverState *bs;
1573

    
1574
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1575
        int ret = bdrv_commit(bs);
1576
        if (ret < 0) {
1577
            return ret;
1578
        }
1579
    }
1580
    return 0;
1581
}
1582

    
1583
struct BdrvTrackedRequest {
1584
    BlockDriverState *bs;
1585
    int64_t sector_num;
1586
    int nb_sectors;
1587
    bool is_write;
1588
    QLIST_ENTRY(BdrvTrackedRequest) list;
1589
    Coroutine *co; /* owner, used for deadlock detection */
1590
    CoQueue wait_queue; /* coroutines blocked on this request */
1591
};
1592

    
1593
/**
1594
 * Remove an active request from the tracked requests list
1595
 *
1596
 * This function should be called when a tracked request is completing.
1597
 */
1598
static void tracked_request_end(BdrvTrackedRequest *req)
1599
{
1600
    QLIST_REMOVE(req, list);
1601
    qemu_co_queue_restart_all(&req->wait_queue);
1602
}
1603

    
1604
/**
1605
 * Add an active request to the tracked requests list
1606
 */
1607
static void tracked_request_begin(BdrvTrackedRequest *req,
1608
                                  BlockDriverState *bs,
1609
                                  int64_t sector_num,
1610
                                  int nb_sectors, bool is_write)
1611
{
1612
    *req = (BdrvTrackedRequest){
1613
        .bs = bs,
1614
        .sector_num = sector_num,
1615
        .nb_sectors = nb_sectors,
1616
        .is_write = is_write,
1617
        .co = qemu_coroutine_self(),
1618
    };
1619

    
1620
    qemu_co_queue_init(&req->wait_queue);
1621

    
1622
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1623
}
1624

    
1625
/**
1626
 * Round a region to cluster boundaries
1627
 */
1628
static void round_to_clusters(BlockDriverState *bs,
1629
                              int64_t sector_num, int nb_sectors,
1630
                              int64_t *cluster_sector_num,
1631
                              int *cluster_nb_sectors)
1632
{
1633
    BlockDriverInfo bdi;
1634

    
1635
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1636
        *cluster_sector_num = sector_num;
1637
        *cluster_nb_sectors = nb_sectors;
1638
    } else {
1639
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1640
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1641
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1642
                                            nb_sectors, c);
1643
    }
1644
}
1645

    
1646
/* True when the sector range [sector_num, sector_num + nb_sectors)
 * intersects the range covered by req. */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    /* The ranges overlap unless one of them ends at or before the point
     * where the other begins. */
    return sector_num < req->sector_num + req->nb_sectors &&
           req->sector_num < sector_num + nb_sectors;
}
1658

    
1659
/*
 * Block the calling coroutine until no tracked request on bs overlaps the
 * given sector range, after widening that range to cluster boundaries.
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    /* Touching the same cluster counts as an overlap.  This guarantees that
     * allocating writes are serialized and do not race with each other for
     * the same cluster.  In copy-on-read, for example, it makes the CoR
     * read and write operations atomic so that guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    for (;;) {
        BdrvTrackedRequest *conflict = NULL;

        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                conflict = req;
                break;
            }
        }

        if (conflict == NULL) {
            break;
        }

        /* An overlap with our own request would be a reentrant request,
         * for example a block driver issuing nested requests.  This must
         * never happen since it means deadlock.
         */
        assert(qemu_coroutine_self() != conflict->co);

        /* Sleep on the conflicting request, then rescan from the start */
        qemu_co_queue_wait(&conflict->wait_queue);
    }
}
1694

    
1695
/*
1696
 * Return values:
1697
 * 0        - success
1698
 * -EINVAL  - backing format specified, but no file
1699
 * -ENOSPC  - can't update the backing file because no space is left in the
1700
 *            image file header
1701
 * -ENOTSUP - format driver doesn't support changing the backing file
1702
 */
1703
int bdrv_change_backing_file(BlockDriverState *bs,
1704
    const char *backing_file, const char *backing_fmt)
1705
{
1706
    BlockDriver *drv = bs->drv;
1707
    int ret;
1708

    
1709
    /* Backing file format doesn't make sense without a backing file */
1710
    if (backing_fmt && !backing_file) {
1711
        return -EINVAL;
1712
    }
1713

    
1714
    if (drv->bdrv_change_backing_file != NULL) {
1715
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1716
    } else {
1717
        ret = -ENOTSUP;
1718
    }
1719

    
1720
    if (ret == 0) {
1721
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1722
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1723
    }
1724
    return ret;
1725
}
1726

    
1727
/*
 * Validate a byte-granularity request against bs.
 *
 * Returns -ENOMEDIUM when no medium is inserted, 0 without further checks
 * when bs is growable, -EIO when the range starts before 0 or does not fit
 * within the length of bs, and 0 otherwise.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t total;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    total = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }
    if (offset > total) {
        return -EIO;
    }
    if (total - offset < size) {
        return -EIO;
    }

    return 0;
}
1748

    
1749
/* Sector-granularity front end for bdrv_check_byte_request(): converts the
 * start sector and sector count to bytes and forwards them. */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    int64_t byte_offset = sector_num * BDRV_SECTOR_SIZE;
    size_t byte_count = nb_sectors * BDRV_SECTOR_SIZE;

    return bdrv_check_byte_request(bs, byte_offset, byte_count);
}
1755

    
1756
typedef struct RwCo {
1757
    BlockDriverState *bs;
1758
    int64_t sector_num;
1759
    int nb_sectors;
1760
    QEMUIOVector *qiov;
1761
    bool is_write;
1762
    int ret;
1763
} RwCo;
1764

    
1765
/* Coroutine entry point: perform the read or write described by the RwCo
 * passed as opaque and store the outcome in its ret field. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *req = opaque;

    if (req->is_write) {
        req->ret = bdrv_co_do_writev(req->bs, req->sector_num,
                                     req->nb_sectors, req->qiov, 0);
    } else {
        req->ret = bdrv_co_do_readv(req->bs, req->sector_num,
                                    req->nb_sectors, req->qiov, 0);
    }
}
1777

    
1778
/*
1779
 * Process a synchronous request using coroutines
1780
 */
1781
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1782
                      int nb_sectors, bool is_write)
1783
{
1784
    QEMUIOVector qiov;
1785
    struct iovec iov = {
1786
        .iov_base = (void *)buf,
1787
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1788
    };
1789
    Coroutine *co;
1790
    RwCo rwco = {
1791
        .bs = bs,
1792
        .sector_num = sector_num,
1793
        .nb_sectors = nb_sectors,
1794
        .qiov = &qiov,
1795
        .is_write = is_write,
1796
        .ret = NOT_DONE,
1797
    };
1798

    
1799
    qemu_iovec_init_external(&qiov, &iov, 1);
1800

    
1801
    /**
1802
     * In sync call context, when the vcpu is blocked, this throttling timer
1803
     * will not fire; so the I/O throttling function has to be disabled here
1804
     * if it has been enabled.
1805
     */
1806
    if (bs->io_limits_enabled) {
1807
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
1808
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
1809
        bdrv_io_limits_disable(bs);
1810
    }
1811

    
1812
    if (qemu_in_coroutine()) {
1813
        /* Fast-path if already in coroutine context */
1814
        bdrv_rw_co_entry(&rwco);
1815
    } else {
1816
        co = qemu_coroutine_create(bdrv_rw_co_entry);
1817
        qemu_coroutine_enter(co, &rwco);
1818
        while (rwco.ret == NOT_DONE) {
1819
            qemu_aio_wait();
1820
        }
1821
    }
1822
    return rwco.ret;
1823
}
1824

    
1825
/* return < 0 if error. See bdrv_write() for the return codes */
1826
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1827
              uint8_t *buf, int nb_sectors)
1828
{
1829
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1830
}
1831

    
1832
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
1833
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
1834
                          uint8_t *buf, int nb_sectors)
1835
{
1836
    bool enabled;
1837
    int ret;
1838

    
1839
    enabled = bs->io_limits_enabled;
1840
    bs->io_limits_enabled = false;
1841
    ret = bdrv_read(bs, 0, buf, 1);
1842
    bs->io_limits_enabled = enabled;
1843
    return ret;
1844
}
1845

    
1846
#define BITS_PER_LONG  (sizeof(unsigned long) * 8)
1847

    
1848
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1849
                             int nb_sectors, int dirty)
1850
{
1851
    int64_t start, end;
1852
    unsigned long val, idx, bit;
1853

    
1854
    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1855
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1856

    
1857
    for (; start <= end; start++) {
1858
        idx = start / BITS_PER_LONG;
1859
        bit = start % BITS_PER_LONG;
1860
        val = bs->dirty_bitmap[idx];
1861
        if (dirty) {
1862
            if (!(val & (1UL << bit))) {
1863
                bs->dirty_count++;
1864
                val |= 1UL << bit;
1865
            }
1866
        } else {
1867
            if (val & (1UL << bit)) {
1868
                bs->dirty_count--;
1869
                val &= ~(1UL << bit);
1870
            }
1871
        }
1872
        bs->dirty_bitmap[idx] = val;
1873
    }
1874
}
1875

    
1876
/* Return < 0 if error. Important errors are:
1877
  -EIO         generic I/O error (may happen for all errors)
1878
  -ENOMEDIUM   No media inserted.
1879
  -EINVAL      Invalid sector number or nb_sectors
1880
  -EACCES      Trying to write a read-only device
1881
*/
1882
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1883
               const uint8_t *buf, int nb_sectors)
1884
{
1885
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1886
}
1887

    
1888
/*
 * Read count1 bytes at byte offset `offset` into buf.
 *
 * The range is split into an unaligned head, a run of whole sectors and an
 * unaligned tail.  Head and tail go through a one-sector bounce buffer; the
 * whole sectors are read straight into buf.
 *
 * Returns count1 on success or the negative result of the failing
 * bdrv_read() call.
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t bounce[BDRV_SECTOR_SIZE];
    uint8_t *dst = buf;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int chunk, whole_sectors;
    int ret;

    /* head: bytes from offset up to the next sector boundary */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, bounce, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, bounce + (offset & (BDRV_SECTOR_SIZE - 1)), chunk);
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        dst += chunk;
    }

    /* middle: whole sectors, read "in place" */
    whole_sectors = remaining >> BDRV_SECTOR_BITS;
    if (whole_sectors > 0) {
        ret = bdrv_read(bs, sector_num, dst, whole_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += whole_sectors;
        chunk = whole_sectors << BDRV_SECTOR_BITS;
        dst += chunk;
        remaining -= chunk;
    }

    /* tail: leading bytes of the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, bounce, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, bounce, remaining);
    }
    return count1;
}
1932

    
1933
/*
 * Write 'count1' bytes from 'buf' starting at byte position 'offset'.
 *
 * The request does not have to be sector aligned: a partially covered first
 * or last sector is handled by read-modify-write through a one-sector bounce
 * buffer, the aligned middle part is written straight from the caller's
 * buffer.
 *
 * Returns count1 on success, or the negative value returned by bdrv_read()
 * or bdrv_write() on failure.
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    /* Byte cursor into the caller's buffer.  Arithmetic directly on 'void *'
     * is a GNU extension and not valid ISO C. */
    const uint8_t *src = buf;
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count) {
        len = count;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        /* read-modify-write of the partially covered first sector */
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), src, len);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        count -= len;
        if (count == 0) {
            return count1;
        }
        sector_num++;
        src += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_write(bs, sector_num, src, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        src += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        /* read-modify-write of the partially covered last sector */
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(tmp_buf, src, count);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }
    return count1;
}
1981

    
1982
/*
1983
 * Writes to the file and ensures that no writes are reordered across this
1984
 * request (acts as a barrier)
1985
 *
1986
 * Returns 0 on success, -errno in error cases.
1987
 */
1988
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1989
    const void *buf, int count)
1990
{
1991
    int ret;
1992

    
1993
    ret = bdrv_pwrite(bs, offset, buf, count);
1994
    if (ret < 0) {
1995
        return ret;
1996
    }
1997

    
1998
    /* No flush needed for cache modes that already do it */
1999
    if (bs->enable_write_cache) {
2000
        bdrv_flush(bs);
2001
    }
2002

    
2003
    return 0;
2004
}
2005

    
2006
/*
 * Serve a read by reading the enclosing cluster range (as computed by
 * round_to_clusters()) into a bounce buffer, writing that data back through
 * the driver, and finally copying the requested part into 'qiov'.
 *
 * Returns the (negative) driver result on read or write failure, otherwise
 * the result of the write step.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector bounce_qiov;
    struct iovec iov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    iov.iov_base = bounce_buffer;
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto out;
    }

    if (!drv->bdrv_co_write_zeroes ||
        !buffer_is_zero(bounce_buffer, iov.iov_len)) {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    } else {
        /* All-zero data and the driver offers a write-zeroes callback */
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto out;
    }

    /* Hand back only the part of the cluster range that was asked for */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, (uint8_t *)bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

out:
    qemu_vfree(bounce_buffer);
    return ret;
}
2071

    
2072
/*
2073
 * Handle a read request in coroutine context
2074
 */
2075
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2076
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2077
    BdrvRequestFlags flags)
2078
{
2079
    BlockDriver *drv = bs->drv;
2080
    BdrvTrackedRequest req;
2081
    int ret;
2082

    
2083
    if (!drv) {
2084
        return -ENOMEDIUM;
2085
    }
2086
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2087
        return -EIO;
2088
    }
2089

    
2090
    /* throttling disk read I/O */
2091
    if (bs->io_limits_enabled) {
2092
        bdrv_io_limits_intercept(bs, false, nb_sectors);
2093
    }
2094

    
2095
    if (bs->copy_on_read) {
2096
        flags |= BDRV_REQ_COPY_ON_READ;
2097
    }
2098
    if (flags & BDRV_REQ_COPY_ON_READ) {
2099
        bs->copy_on_read_in_flight++;
2100
    }
2101

    
2102
    if (bs->copy_on_read_in_flight) {
2103
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2104
    }
2105

    
2106
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2107

    
2108
    if (flags & BDRV_REQ_COPY_ON_READ) {
2109
        int pnum;
2110

    
2111
        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
2112
        if (ret < 0) {
2113
            goto out;
2114
        }
2115

    
2116
        if (!ret || pnum != nb_sectors) {
2117
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2118
            goto out;
2119
        }
2120
    }
2121

    
2122
    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2123

    
2124
out:
2125
    tracked_request_end(&req);
2126

    
2127
    if (flags & BDRV_REQ_COPY_ON_READ) {
2128
        bs->copy_on_read_in_flight--;
2129
    }
2130

    
2131
    return ret;
2132
}
2133

    
2134
/* Traced entry point for a read without extra request flags */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2141

    
2142
/* Traced entry point for a read with BDRV_REQ_COPY_ON_READ set */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
2150

    
2151
/*
 * Write 'nb_sectors' sectors of zeroes starting at 'sector_num'.
 *
 * The driver's bdrv_co_write_zeroes callback is tried first; if it is absent
 * or returns -ENOTSUP, a zero-filled bounce buffer is written instead.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    struct iovec zero_iov;
    QEMUIOVector zero_qiov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    zero_iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    zero_iov.iov_base = qemu_blockalign(bs, zero_iov.iov_len);
    memset(zero_iov.iov_base, 0, zero_iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &zero_iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);

    qemu_vfree(zero_iov.iov_base);
    return ret;
}
2181

    
2182
/*
2183
 * Handle a write request in coroutine context
2184
 */
2185
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2186
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2187
    BdrvRequestFlags flags)
2188
{
2189
    BlockDriver *drv = bs->drv;
2190
    BdrvTrackedRequest req;
2191
    int ret;
2192

    
2193
    if (!bs->drv) {
2194
        return -ENOMEDIUM;
2195
    }
2196
    if (bs->read_only) {
2197
        return -EACCES;
2198
    }
2199
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2200
        return -EIO;
2201
    }
2202

    
2203
    /* throttling disk write I/O */
2204
    if (bs->io_limits_enabled) {
2205
        bdrv_io_limits_intercept(bs, true, nb_sectors);
2206
    }
2207

    
2208
    if (bs->copy_on_read_in_flight) {
2209
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2210
    }
2211

    
2212
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2213

    
2214
    if (flags & BDRV_REQ_ZERO_WRITE) {
2215
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2216
    } else {
2217
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2218
    }
2219

    
2220
    if (ret == 0 && !bs->enable_write_cache) {
2221
        ret = bdrv_co_flush(bs);
2222
    }
2223

    
2224
    if (bs->dirty_bitmap) {
2225
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2226
    }
2227

    
2228
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2229
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
2230
    }
2231

    
2232
    tracked_request_end(&req);
2233

    
2234
    return ret;
2235
}
2236

    
2237
/* Traced entry point for a write without extra request flags */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2244

    
2245
/* Traced entry point for a zero write: no data vector, ZERO_WRITE flag set */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    int ret;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                            BDRV_REQ_ZERO_WRITE);
    return ret;
}
2253

    
2254
/**
2255
 * Truncate file to 'offset' bytes (needed only for file protocols)
2256
 */
2257
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2258
{
2259
    BlockDriver *drv = bs->drv;
2260
    int ret;
2261
    if (!drv)
2262
        return -ENOMEDIUM;
2263
    if (!drv->bdrv_truncate)
2264
        return -ENOTSUP;
2265
    if (bs->read_only)
2266
        return -EACCES;
2267
    if (bdrv_in_use(bs))
2268
        return -EBUSY;
2269
    ret = drv->bdrv_truncate(bs, offset);
2270
    if (ret == 0) {
2271
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2272
        bdrv_dev_resize_cb(bs);
2273
    }
2274
    return ret;
2275
}
2276

    
2277
/**
2278
 * Length of a allocated file in bytes. Sparse files are counted by actual
2279
 * allocated space. Return < 0 if error or unknown.
2280
 */
2281
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2282
{
2283
    BlockDriver *drv = bs->drv;
2284
    if (!drv) {
2285
        return -ENOMEDIUM;
2286
    }
2287
    if (drv->bdrv_get_allocated_file_size) {
2288
        return drv->bdrv_get_allocated_file_size(bs);
2289
    }
2290
    if (bs->file) {
2291
        return bdrv_get_allocated_file_size(bs->file);
2292
    }
2293
    return -ENOTSUP;
2294
}
2295

    
2296
/**
2297
 * Length of a file in bytes. Return < 0 if error or unknown.
2298
 */
2299
int64_t bdrv_getlength(BlockDriverState *bs)
2300
{
2301
    BlockDriver *drv = bs->drv;
2302
    if (!drv)
2303
        return -ENOMEDIUM;
2304

    
2305
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2306
        if (drv->bdrv_getlength) {
2307
            return drv->bdrv_getlength(bs);
2308
        }
2309
    }
2310
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2311
}
2312

    
2313
/* return 0 as number of sectors if no device present or error */
2314
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2315
{
2316
    int64_t length;
2317
    length = bdrv_getlength(bs);
2318
    if (length < 0)
2319
        length = 0;
2320
    else
2321
        length = length >> BDRV_SECTOR_BITS;
2322
    *nb_sectors_ptr = length;
2323
}
2324

    
2325
/* throttling disk io limits */
2326
void bdrv_set_io_limits(BlockDriverState *bs,
2327
                        BlockIOLimit *io_limits)
2328
{
2329
    bs->io_limits = *io_limits;
2330
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2331
}
2332

    
2333
/* Store the actions to take on read and on write errors */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
2339

    
2340
/* Return the configured error action for reads (is_read != 0) or writes */
BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
2344

    
2345
/* Return the device's read_only flag */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
2349

    
2350
/* Return the device's sg flag */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
2354

    
2355
/* Return the device's enable_write_cache setting */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
2359

    
2360
/* Set the write cache mode and mirror it into the stored open flags */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (!wce) {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    } else {
        bs->open_flags |= BDRV_O_CACHE_WB;
    }
}
2371

    
2372
/* Return 1 if the backing device is encrypted, else the device's own flag */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
2378

    
2379
/*
 * Return non-zero if either the backing device or the device itself is
 * encrypted and has no valid key yet.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted && !backing->valid_key) {
        return 1;
    }

    return (bs->encrypted && !bs->valid_key);
}
2387

    
2388
/*
 * Set the encryption key, first on an encrypted backing device (if any),
 * then on the device itself.
 *
 * Returns -EINVAL if the device is not encrypted, -ENOMEDIUM if there is no
 * driver or no set_key callback, otherwise the driver result.  A successful
 * key change on a device without a valid key triggers the media change
 * callback.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    BlockDriverState *backing = bs->backing_hd;
    int ret;

    if (backing && backing->encrypted) {
        ret = bdrv_set_key(backing, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            /* Only the backing device needed a key */
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2413

    
2414
/* Return the driver's format name, or NULL if no driver is attached */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }
    return bs->drv->format_name;
}
2418

    
2419
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2420
                         void *opaque)
2421
{
2422
    BlockDriver *drv;
2423

    
2424
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
2425
        it(opaque, drv->format_name);
2426
    }
2427
}
2428

    
2429
BlockDriverState *bdrv_find(const char *name)
2430
{
2431
    BlockDriverState *bs;
2432

    
2433
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2434
        if (!strcmp(name, bs->device_name)) {
2435
            return bs;
2436
        }
2437
    }
2438
    return NULL;
2439
}
2440

    
2441
/*
 * Iterate over bdrv_states: NULL yields the first device, any other value
 * yields the device following it in the list.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
2448

    
2449
/* Call 'it' with 'opaque' for every device in bdrv_states */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
2457

    
2458
/* Return the device_name stored in 'bs' (not a copy) */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
2462

    
2463
/* Return the open flags stored in 'bs' */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
2467

    
2468
void bdrv_flush_all(void)
2469
{
2470
    BlockDriverState *bs;
2471

    
2472
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2473
        bdrv_flush(bs);
2474
    }
2475
}
2476

    
2477
/*
 * Ask the driver's bdrv_has_zero_init callback; drivers without the callback
 * report 1.  A driver must be attached (asserted).
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (!bs->drv->bdrv_has_zero_init) {
        return 1;
    }

    return bs->drv->bdrv_has_zero_init(bs);
}
2487

    
2488
typedef struct BdrvCoIsAllocatedData {
2489
    BlockDriverState *bs;
2490
    int64_t sector_num;
2491
    int nb_sectors;
2492
    int *pnum;
2493
    int ret;
2494
    bool done;
2495
} BdrvCoIsAllocatedData;
2496

    
2497
/*
2498
 * Returns true iff the specified sector is present in the disk image. Drivers
2499
 * not implementing the functionality are assumed to not support backing files,
2500
 * hence all their sectors are reported as allocated.
2501
 *
2502
 * If 'sector_num' is beyond the end of the disk image the return value is 0
2503
 * and 'pnum' is set to 0.
2504
 *
2505
 * 'pnum' is set to the number of sectors (including and immediately following
2506
 * the specified sector) that are known to be in the same
2507
 * allocated/unallocated state.
2508
 *
2509
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2510
 * beyond the end of the disk image it will be clamped.
2511
 */
2512
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2513
                                      int nb_sectors, int *pnum)
2514
{
2515
    int64_t n;
2516

    
2517
    if (sector_num >= bs->total_sectors) {
2518
        *pnum = 0;
2519
        return 0;
2520
    }
2521

    
2522
    n = bs->total_sectors - sector_num;
2523
    if (n < nb_sectors) {
2524
        nb_sectors = n;
2525
    }
2526

    
2527
    if (!bs->drv->bdrv_co_is_allocated) {
2528
        *pnum = nb_sectors;
2529
        return 1;
2530
    }
2531

    
2532
    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2533
}
2534

    
2535
/* Coroutine wrapper for bdrv_is_allocated() */
2536
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2537
{
2538
    BdrvCoIsAllocatedData *data = opaque;
2539
    BlockDriverState *bs = data->bs;
2540

    
2541
    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2542
                                     data->pnum);
2543
    data->done = true;
2544
}
2545

    
2546
/*
2547
 * Synchronous wrapper around bdrv_co_is_allocated().
2548
 *
2549
 * See bdrv_co_is_allocated() for details.
2550
 */
2551
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2552
                      int *pnum)
2553
{
2554
    Coroutine *co;
2555
    BdrvCoIsAllocatedData data = {
2556
        .bs = bs,
2557
        .sector_num = sector_num,
2558
        .nb_sectors = nb_sectors,
2559
        .pnum = pnum,
2560
        .done = false,
2561
    };
2562

    
2563
    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2564
    qemu_coroutine_enter(co, &data);
2565
    while (!data.done) {
2566
        qemu_aio_wait();
2567
    }
2568
    return data.ret;
2569
}
2570

    
2571
/*
2572
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2573
 *
2574
 * Return true if the given sector is allocated in any image between
2575
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
2576
 * sector is allocated in any image of the chain.  Return false otherwise.
2577
 *
2578
 * 'pnum' is set to the number of sectors (including and immediately following
2579
 *  the specified sector) that are known to be in the same
2580
 *  allocated/unallocated state.
2581
 *
2582
 */
2583
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2584
                                            BlockDriverState *base,
2585
                                            int64_t sector_num,
2586
                                            int nb_sectors, int *pnum)
2587
{
2588
    BlockDriverState *intermediate;
2589
    int ret, n = nb_sectors;
2590

    
2591
    intermediate = top;
2592
    while (intermediate && intermediate != base) {
2593
        int pnum_inter;
2594
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2595
                                   &pnum_inter);
2596
        if (ret < 0) {
2597
            return ret;
2598
        } else if (ret) {
2599
            *pnum = pnum_inter;
2600
            return 1;
2601
        }
2602

    
2603
        /*
2604
         * [sector_num, nb_sectors] is unallocated on top but intermediate
2605
         * might have
2606
         *
2607
         * [sector_num+x, nr_sectors] allocated.
2608
         */
2609
        if (n > pnum_inter) {
2610
            n = pnum_inter;
2611
        }
2612

    
2613
        intermediate = intermediate->backing_hd;
2614
    }
2615

    
2616
    *pnum = n;
2617
    return 0;
2618
}
2619

    
2620
/*
 * Build a list with one BlockInfoList entry per device in bdrv_states.
 *
 * Tray, I/O status and inserted-media details are only filled in when they
 * apply to the device.  'errp' is not used.
 */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        /* Details of the inserted medium exist only with a driver attached */
        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            info->value->inserted->encryption_key_missing =
                bdrv_key_required(bs);

            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file =
                    g_strdup(bs->backing_file);
            }

            info->value->inserted->backing_file_depth =
                bdrv_get_backing_file_depth(bs);

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        /* Append at the tail; info->next is already NULL from g_malloc0() */
        *p_next = info;
        p_next = &info->next;
    }

    return head;
}
2687

    
2688
/* Consider exposing this as a full fledged QMP command */
2689
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2690
{
2691
    BlockStats *s;
2692

    
2693
    s = g_malloc0(sizeof(*s));
2694

    
2695
    if (bs->device_name[0]) {
2696
        s->has_device = true;
2697
        s->device = g_strdup(bs->device_name);
2698
    }
2699

    
2700
    s->stats = g_malloc0(sizeof(*s->stats));
2701
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2702
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2703
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2704
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2705
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2706
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2707
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2708
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2709
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2710

    
2711
    if (bs->file) {
2712
        s->has_parent = true;
2713
        s->parent = qmp_query_blockstat(bs->file, NULL);
2714
    }
2715

    
2716
    return s;
2717
}
2718

    
2719
/*
 * Build a list with the statistics of every device in bdrv_states.
 * 'errp' is not used.
 */
BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, **p_next = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));

        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        /* Append at the tail; info->next is already NULL from g_malloc0() */
        *p_next = info;
        p_next = &info->next;
    }

    return head;
}
2739

    
2740
/*
 * Return the name of the encrypted image: the backing file name if the
 * backing device is encrypted, else the device's own file name if it is
 * encrypted, else NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }
    if (bs->encrypted) {
        return bs->filename;
    }
    return NULL;
}
2749

    
2750
/* Copy the backing file name into 'filename' (buffer of 'filename_size') */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing = bs->backing_file;

    pstrcpy(filename, filename_size, backing);
}
2755

    
2756
/*
 * Write 'nb_sectors' sectors through the driver's compressed-write callback.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if the driver has no such
 * callback, -EIO if bdrv_check_request() rejects the request, otherwise the
 * driver result.  The dirty bitmap, if any, is updated before the write.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
2773

    
2774
/*
 * Fill '*bdi' via the driver's bdrv_get_info callback.
 *
 * Returns -ENOMEDIUM without a driver and -ENOTSUP without the callback; in
 * both cases '*bdi' is left untouched.  Otherwise '*bdi' is zeroed first and
 * the driver result is returned.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
2784

    
2785
/*
 * Save VM state through the driver's bdrv_save_vmstate callback, falling
 * back to bs->file when the driver has none.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if neither a callback nor an
 * underlying file exists.
 */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    }
    if (!bs->file) {
        return -ENOTSUP;
    }

    return bdrv_save_vmstate(bs->file, buf, pos, size);
}
2797

    
2798
/*
 * Load VM state through the driver's bdrv_load_vmstate callback, falling
 * back to bs->file when the driver has none.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if neither a callback nor an
 * underlying file exists.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_load_vmstate) {
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    }
    if (!bs->file) {
        return -ENOTSUP;
    }

    return bdrv_load_vmstate(bs->file, buf, pos, size);
}
2810

    
2811
/* Forward 'event' to the driver's debug callback, if there is one */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_debug_event) {
        drv->bdrv_debug_event(bs, event);
    }
}
2822

    
2823
/**************************************************************/
2824
/* handling of snapshots */
2825

    
2826
/*
 * Return 1 if a snapshot can be created on 'bs', 0 otherwise.
 *
 * Every level must have a driver, be inserted and be writable.  If the
 * driver has no snapshot_create callback the check continues with bs->file.
 */
int bdrv_can_snapshot(BlockDriverState *bs)
{
    for (;;) {
        BlockDriver *drv = bs->drv;

        if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
            return 0;
        }
        if (drv->bdrv_snapshot_create) {
            return 1;
        }
        if (bs->file == NULL) {
            return 0;
        }
        /* No snapshot support at this level: check the underlying file */
        bs = bs->file;
    }
}
2842

    
2843
/* Return 1 if BDRV_O_SNAPSHOT is set in the open flags, else 0 */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
2847

    
2848
BlockDriverState *bdrv_snapshots(void)
2849
{
2850
    BlockDriverState *bs;
2851

    
2852
    if (bs_snapshots) {
2853
        return bs_snapshots;
2854
    }
2855

    
2856
    bs = NULL;
2857
    while ((bs = bdrv_next(bs))) {
2858
        if (bdrv_can_snapshot(bs)) {
2859
            bs_snapshots = bs;
2860
            return bs;
2861
        }
2862
    }
2863
    return NULL;
2864
}
2865

    
2866
int bdrv_snapshot_create(BlockDriverState *bs,
2867
                         QEMUSnapshotInfo *sn_info)
2868
{
2869
    BlockDriver *drv = bs->drv;
2870
    if (!drv)
2871
        return -ENOMEDIUM;
2872
    if (drv->bdrv_snapshot_create)
2873
        return drv->bdrv_snapshot_create(bs, sn_info);
2874
    if (bs->file)
2875
        return bdrv_snapshot_create(bs->file, sn_info);
2876
    return -ENOTSUP;
2877
}
2878

    
2879
/*
 * Revert to snapshot 'snapshot_id'.
 *
 * If the driver has no snapshot_goto callback but an underlying file exists,
 * the driver is closed, the snapshot is applied to bs->file and the driver
 * is opened again.  A failed reopen deletes bs->file, detaches the driver
 * and returns the open error.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if neither path applies.
 */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_snapshot_goto) {
        return drv->bdrv_snapshot_goto(bs, snapshot_id);
    }
    if (!bs->file) {
        return -ENOTSUP;
    }

    drv->bdrv_close(bs);
    ret = bdrv_snapshot_goto(bs->file, snapshot_id);
    open_ret = drv->bdrv_open(bs, bs->open_flags);
    if (open_ret < 0) {
        bdrv_delete(bs->file);
        bs->drv = NULL;
        return open_ret;
    }

    return ret;
}
2904

    
2905
/*
 * Delete the snapshot identified by @snapshot_id.
 *
 * Returns -ENOMEDIUM when @bs has no driver.  The driver's own
 * bdrv_snapshot_delete is used when present; otherwise the request is
 * forwarded to bs->file, or -ENOTSUP is returned when there is none.
 */
int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *driver = bs->drv;

    if (driver == NULL) {
        return -ENOMEDIUM;
    }
    if (driver->bdrv_snapshot_delete != NULL) {
        return driver->bdrv_snapshot_delete(bs, snapshot_id);
    }
    if (bs->file == NULL) {
        return -ENOTSUP;
    }
    return bdrv_snapshot_delete(bs->file, snapshot_id);
}
2916

    
2917
int bdrv_snapshot_list(BlockDriverState *bs,
2918
                       QEMUSnapshotInfo **psn_info)
2919
{
2920
    BlockDriver *drv = bs->drv;
2921
    if (!drv)
2922
        return -ENOMEDIUM;
2923
    if (drv->bdrv_snapshot_list)
2924
        return drv->bdrv_snapshot_list(bs, psn_info);
2925
    if (bs->file)
2926
        return bdrv_snapshot_list(bs->file, psn_info);
2927
    return -ENOTSUP;
2928
}
2929

    
2930
/*
 * Ask the driver to load the snapshot named @snapshot_name via its
 * bdrv_snapshot_load_tmp callback.
 *
 * Returns -ENOMEDIUM without a driver, -EINVAL when @bs is not read-only and
 * -ENOTSUP when the driver has no bdrv_snapshot_load_tmp callback.
 */
int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *driver = bs->drv;

    if (driver == NULL) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (driver->bdrv_snapshot_load_tmp == NULL) {
        return -ENOTSUP;
    }
    return driver->bdrv_snapshot_load_tmp(bs, snapshot_name);
}
2945

    
2946
/*
 * Walk the backing chain starting at @bs and return the backing device whose
 * parent records @backing_file as its backing file name (exact strcmp
 * match).  The walk stops, returning NULL, at the first device without a
 * driver or without a backing device.
 */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    BlockDriverState *cur;

    for (cur = bs; cur->drv && cur->backing_hd; cur = cur->backing_hd) {
        if (strcmp(cur->backing_file, backing_file) == 0) {
            return cur->backing_hd;
        }
    }

    return NULL;
}
2963

    
2964
/*
 * Count the backing devices below @bs.  Counting stops at the first device
 * that has no driver or no backing device.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    int depth = 0;

    while (bs->drv && bs->backing_hd) {
        depth++;
        bs = bs->backing_hd;
    }

    return depth;
}
2976

    
2977
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (capacity @buf_size) as a short human readable
 * string and return @buf.
 *
 * Values up to 999 (including negative ones) are printed as plain integers.
 * Larger values are scaled by powers of 1024 and suffixed with K, M, G or T:
 * one decimal place while the scaled value is below 10, otherwise rounded to
 * the nearest integer.  Values too large for the other suffixes are
 * expressed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                /*
                 * Round to nearest using quotient and remainder.  Adding
                 * base / 2 to size first would overflow int64_t for sizes
                 * close to INT64_MAX.
                 */
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         size / base + (size % base >= (base >> 1)),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
3006

    
3007
/*
 * Format one line of the snapshot table into @buf (capacity @buf_size) and
 * return @buf.
 *
 * With @sn == NULL the column headings are written.  Otherwise the line
 * holds the snapshot id, name, VM state size (human readable), creation date
 * in local time and the VM clock as hours:minutes:seconds.milliseconds.
 */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char size_buf[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (sn == NULL) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
        return buf;
    }

    /* Creation date, broken down in the local time zone. */
    ti = sn->date_sec;
#ifdef _WIN32
    ptm = localtime(&ti);
    strftime(date_buf, sizeof(date_buf),
             "%Y-%m-%d %H:%M:%S", ptm);
#else
    localtime_r(&ti, &tm);
    strftime(date_buf, sizeof(date_buf),
             "%Y-%m-%d %H:%M:%S", &tm);
#endif

    /* VM clock: whole seconds split into h:m:s plus the millisecond part. */
    secs = sn->vm_clock_nsec / 1000000000;
    snprintf(clock_buf, sizeof(clock_buf),
             "%02d:%02d:%02d.%03d",
             (int)(secs / 3600),
             (int)((secs / 60) % 60),
             (int)(secs % 60),
             (int)((sn->vm_clock_nsec / 1000000) % 1000));

    get_human_readable_size(size_buf, sizeof(size_buf), sn->vm_state_size);
    snprintf(buf, buf_size,
             "%-10s%-20s%7s%20s%15s",
             sn->id_str, sn->name,
             size_buf,
             date_buf,
             clock_buf);
    return buf;
}
3049

    
3050
/**************************************************************/
3051
/* async I/Os */
3052

    
3053
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3054
                                 QEMUIOVector *qiov, int nb_sectors,
3055
                                 BlockDriverCompletionFunc *cb, void *opaque)
3056
{
3057
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3058

    
3059
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3060
                                 cb, opaque, false);
3061
}
3062

    
3063
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3064
                                  QEMUIOVector *qiov, int nb_sectors,
3065
                                  BlockDriverCompletionFunc *cb, void *opaque)
3066
{
3067
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3068

    
3069
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3070
                                 cb, opaque, true);
3071
}
3072

    
3073

    
3074
typedef struct MultiwriteCB {
3075
    int error;
3076
    int num_requests;
3077
    int num_callbacks;
3078
    struct {
3079
        BlockDriverCompletionFunc *cb;
3080
        void *opaque;
3081
        QEMUIOVector *free_qiov;
3082
    } callbacks[];
3083
} MultiwriteCB;
3084

    
3085
/*
 * Invoke every original request's callback with the accumulated error code,
 * then release the merged I/O vector recorded in the same slot, if any.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx;

    for (idx = 0; idx < mcb->num_callbacks; idx++) {
        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        if (mcb->callbacks[idx].free_qiov != NULL) {
            qemu_iovec_destroy(mcb->callbacks[idx].free_qiov);
        }
        g_free(mcb->callbacks[idx].free_qiov);
    }
}
3097

    
3098
static void multiwrite_cb(void *opaque, int ret)
3099
{
3100
    MultiwriteCB *mcb = opaque;
3101

    
3102
    trace_multiwrite_cb(mcb, ret);
3103

    
3104
    if (ret < 0 && !mcb->error) {
3105
        mcb->error = ret;
3106
    }
3107

    
3108
    mcb->num_requests--;
3109
    if (mcb->num_requests == 0) {
3110
        multiwrite_user_cb(mcb);
3111
        g_free(mcb);
3112
    }
3113
}
3114

    
3115
static int multiwrite_req_compare(const void *a, const void *b)
3116
{
3117
    const BlockRequest *req1 = a, *req2 = b;
3118

    
3119
    /*
3120
     * Note that we can't simply subtract req2->sector from req1->sector
3121
     * here as that could overflow the return value.
3122
     */
3123
    if (req1->sector > req2->sector) {
3124
        return 1;
3125
    } else if (req1->sector < req2->sector) {
3126
        return -1;
3127
    } else {
3128
        return 0;
3129
    }
3130
}
3131

    
3132
/*
3133
 * Takes a bunch of requests and tries to merge them. Returns the number of
3134
 * requests that remain after merging.
3135
 */
3136
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3137
    int num_reqs, MultiwriteCB *mcb)
3138
{
3139
    int i, outidx;
3140

    
3141
    // Sort requests by start sector
3142
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3143

    
3144
    // Check if adjacent requests touch the same clusters. If so, combine them,
3145
    // filling up gaps with zero sectors.
3146
    outidx = 0;
3147
    for (i = 1; i < num_reqs; i++) {
3148
        int merge = 0;
3149
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3150

    
3151
        // Handle exactly sequential writes and overlapping writes.
3152
        if (reqs[i].sector <= oldreq_last) {
3153
            merge = 1;
3154
        }
3155

    
3156
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3157
            merge = 0;
3158
        }
3159

    
3160
        if (merge) {
3161
            size_t size;
3162
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3163
            qemu_iovec_init(qiov,
3164
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3165

    
3166
            // Add the first request to the merged one. If the requests are
3167
            // overlapping, drop the last sectors of the first request.
3168
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
3169
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3170

    
3171
            // We should need to add any zeros between the two requests
3172
            assert (reqs[i].sector <= oldreq_last);
3173

    
3174
            // Add the second request
3175
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3176

    
3177
            reqs[outidx].nb_sectors = qiov->size >> 9;
3178
            reqs[outidx].qiov = qiov;
3179

    
3180
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3181
        } else {
3182
            outidx++;
3183
            reqs[outidx].sector     = reqs[i].sector;
3184
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3185
            reqs[outidx].qiov       = reqs[i].qiov;
3186
        }
3187
    }
3188

    
3189
    return outidx + 1;
3190
}
3191

    
3192
/*
3193
 * Submit multiple AIO write requests at once.
3194
 *
3195
 * On success, the function returns 0 and all requests in the reqs array have
3196
 * been submitted. In error case this function returns -1, and any of the
3197
 * requests may or may not be submitted yet. In particular, this means that the
3198
 * callback will be called for some of the requests, for others it won't. The
3199
 * caller must check the error field of the BlockRequest to wait for the right
3200
 * callbacks (if error != 0, no callback will be called).
3201
 *
3202
 * The implementation may modify the contents of the reqs array, e.g. to merge
3203
 * requests. However, the fields opaque and error are left unmodified as they
3204
 * are used to signal failure for a single request to the caller.
3205
 */
3206
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3207
{
3208
    MultiwriteCB *mcb;
3209
    int i;
3210

    
3211
    /* don't submit writes if we don't have a medium */
3212
    if (bs->drv == NULL) {
3213
        for (i = 0; i < num_reqs; i++) {
3214
            reqs[i].error = -ENOMEDIUM;
3215
        }
3216
        return -1;
3217
    }
3218

    
3219
    if (num_reqs == 0) {
3220
        return 0;
3221
    }
3222

    
3223
    // Create MultiwriteCB structure
3224
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3225
    mcb->num_requests = 0;
3226
    mcb->num_callbacks = num_reqs;
3227

    
3228
    for (i = 0; i < num_reqs; i++) {
3229
        mcb->callbacks[i].cb = reqs[i].cb;
3230
        mcb->callbacks[i].opaque = reqs[i].opaque;
3231
    }
3232

    
3233
    // Check for mergable requests
3234
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3235

    
3236
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3237

    
3238
    /* Run the aio requests. */
3239
    mcb->num_requests = num_reqs;
3240
    for (i = 0; i < num_reqs; i++) {
3241
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3242
            reqs[i].nb_sectors, multiwrite_cb, mcb);
3243
    }
3244

    
3245
    return 0;
3246
}
3247

    
3248
/* Cancel @acb through the cancel hook of the pool it was allocated from. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    AIOPool *owner = acb->pool;

    owner->cancel(acb);
}
3252

    
3253
/* block I/O throttling */
3254
/*
 * Check whether a request of @nb_sectors sectors would exceed the
 * bytes-per-second limit within the current slice.
 *
 * The total limit is used when it is set, otherwise the limit for the
 * request's direction.  Returns false, with *@wait set to 0, when no limit
 * applies or the request still fits.  Otherwise returns true, stores the
 * computed delay in *@wait and updates bs->slice_time and bs->slice_end.
 * @wait may be NULL.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t limit = 0;
    double   budget, consumed, requested;
    double   slice_secs, delay;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        limit = bs->io_limits.bps[is_write];
    } else {
        /* No limit configured for this direction. */
        if (wait != NULL) {
            *wait = 0;
        }
        return false;
    }

    /* Bytes allowed over the whole slice. */
    slice_secs = bs->slice_end - bs->slice_start;
    slice_secs /= (NANOSECONDS_PER_SECOND);
    budget = limit * slice_secs;

    /*
     * consumed:  bytes already transferred in this slice, taken from the
     *            accounting counters; both directions count when the total
     *            limit is in force.
     * requested: bytes the new request would transfer.
     * (consumed + requested) / limit is the time needed to transfer all of
     * that data at the configured rate.
     */
    consumed = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        consumed += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }
    requested = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (consumed + requested <= budget) {
        if (wait != NULL) {
            *wait = 0;
        }
        return false;
    }

    /* Approximate time until the request may be dispatched. */
    delay = (consumed + requested) / limit - elapsed_time;

    /*
     * The I/O rate exceeds the limit, so bs->slice_end is extended to keep
     * the current statistics until the timer fires.  The factors were tuned
     * experimentally.
     */
    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait != NULL) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3313

    
3314
/*
 * Check whether one more request would exceed the operations-per-second
 * limit within the current slice.
 *
 * The total limit is used when it is set, otherwise the limit for the
 * request's direction.  Returns false, with *@wait set to 0, when no limit
 * applies or the request still fits.  Otherwise returns true, stores the
 * computed delay in *@wait and updates bs->slice_time and bs->slice_end.
 * @wait may be NULL.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t limit = 0;
    double   budget, done;
    double   slice_secs, delay;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        limit = bs->io_limits.iops[is_write];
    } else {
        /* No limit configured for this direction. */
        if (wait != NULL) {
            *wait = 0;
        }
        return false;
    }

    /* Operations allowed over the whole slice. */
    slice_secs = bs->slice_end - bs->slice_start;
    slice_secs /= (NANOSECONDS_PER_SECOND);
    budget = limit * slice_secs;

    /* Operations already issued; both directions count for a total limit. */
    done = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        done += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (done + 1 <= budget) {
        if (wait != NULL) {
            *wait = 0;
        }
        return false;
    }

    /* Approximate time until the request may be dispatched, never negative. */
    delay = (done + 1) / limit;
    if (delay > elapsed_time) {
        delay = delay - elapsed_time;
    } else {
        delay = 0;
    }

    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait != NULL) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3365

    
3366
/*
 * Decide whether a request of @nb_sectors sectors must be delayed because of
 * the configured bps or iops limits.
 *
 * If the current time is inside the running slice, the slice end is moved to
 * now + slice_time.  Otherwise a new slice is started and the accounting
 * baselines are reset to the current counters.  Returns true and stores the
 * larger of the two computed delays in *@wait when either limit is exceeded;
 * returns false with *@wait set to 0 otherwise.  @wait may be NULL.
 */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      over_bps, over_iops;

    now = qemu_get_clock_ns(vm_clock);
    if (bs->slice_start < now && bs->slice_end > now) {
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Start a new slice and take fresh accounting baselines. */
        bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    /* Seconds elapsed since the start of the slice. */
    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    over_bps  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                       is_write, elapsed_time, &bps_wait);
    over_iops = bdrv_exceed_iops_limits(bs, is_write,
                                        elapsed_time, &iops_wait);

    if (!over_bps && !over_iops) {
        if (wait != NULL) {
            *wait = 0;
        }
        return false;
    }

    max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
    if (wait != NULL) {
        *wait = max_wait;
    }

    /* Make sure the slice lasts at least until the delay has passed. */
    now = qemu_get_clock_ns(vm_clock);
    if (bs->slice_end < now + max_wait) {
        bs->slice_end = now + max_wait;
    }

    return true;
}
3417

    
3418
/**************************************************************/
3419
/* async block device emulation */
3420

    
3421
typedef struct BlockDriverAIOCBSync {
3422
    BlockDriverAIOCB common;
3423
    QEMUBH *bh;
3424
    int ret;
3425
    /* vector translation state */
3426
    QEMUIOVector *qiov;
3427
    uint8_t *bounce;
3428
    int is_write;
3429
} BlockDriverAIOCBSync;
3430

    
3431
/*
 * Cancel hook for emulated AIO: delete the pending bottom half, so the
 * completion callback is not run from it, and return the AIOCB to its pool.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *sync_acb;

    sync_acb = container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(sync_acb->bh);
    sync_acb->bh = NULL;
    qemu_aio_release(sync_acb);
}
3439

    
3440
static AIOPool bdrv_em_aio_pool = {
3441
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3442
    .cancel             = bdrv_aio_cancel_em,
3443
};
3444

    
3445
static void bdrv_aio_bh_cb(void *opaque)
3446
{
3447
    BlockDriverAIOCBSync *acb = opaque;
3448

    
3449
    if (!acb->is_write)
3450
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3451
    qemu_vfree(acb->bounce);
3452
    acb->common.cb(acb->common.opaque, acb->ret);
3453
    qemu_bh_delete(acb->bh);
3454
    acb->bh = NULL;
3455
    qemu_aio_release(acb);
3456
}
3457

    
3458
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3459
                                            int64_t sector_num,
3460
                                            QEMUIOVector *qiov,
3461
                                            int nb_sectors,
3462
                                            BlockDriverCompletionFunc *cb,
3463
                                            void *opaque,
3464
                                            int is_write)
3465

    
3466
{
3467
    BlockDriverAIOCBSync *acb;
3468

    
3469
    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3470
    acb->is_write = is_write;
3471
    acb->qiov = qiov;
3472
    acb->bounce = qemu_blockalign(bs, qiov->size);
3473
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3474

    
3475
    if (is_write) {
3476
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3477
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3478
    } else {
3479
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3480
    }
3481

    
3482
    qemu_bh_schedule(acb->bh);
3483

    
3484
    return &acb->common;
3485
}
3486

    
3487
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3488
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3489
        BlockDriverCompletionFunc *cb, void *opaque)
3490
{
3491
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3492
}
3493

    
3494
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3495
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3496
        BlockDriverCompletionFunc *cb, void *opaque)
3497
{
3498
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3499
}
3500

    
3501

    
3502
typedef struct BlockDriverAIOCBCoroutine {
3503
    BlockDriverAIOCB common;
3504
    BlockRequest req;
3505
    bool is_write;
3506
    QEMUBH* bh;
3507
} BlockDriverAIOCBCoroutine;
3508

    
3509
/*
 * Cancel hook for coroutine based AIOCBs.  @blockacb is not used; the hook
 * only calls qemu_aio_flush().
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    (void)blockacb;

    qemu_aio_flush();
}
3513

    
3514
static AIOPool bdrv_em_co_aio_pool = {
3515
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3516
    .cancel             = bdrv_aio_co_cancel_em,
3517
};
3518

    
3519
static void bdrv_co_em_bh(void *opaque)
3520
{
3521
    BlockDriverAIOCBCoroutine *acb = opaque;
3522

    
3523
    acb->common.cb(acb->common.opaque, acb->req.error);
3524
    qemu_bh_delete(acb->bh);
3525
    qemu_aio_release(acb);
3526
}
3527

    
3528
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3529
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3530
{
3531
    BlockDriverAIOCBCoroutine *acb = opaque;
3532
    BlockDriverState *bs = acb->common.bs;
3533

    
3534
    if (!acb->is_write) {
3535
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3536
            acb->req.nb_sectors, acb->req.qiov, 0);
3537
    } else {
3538
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3539
            acb->req.nb_sectors, acb->req.qiov, 0);
3540
    }
3541

    
3542
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3543
    qemu_bh_schedule(acb->bh);
3544
}
3545

    
3546
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3547
                                               int64_t sector_num,
3548
                                               QEMUIOVector *qiov,
3549
                                               int nb_sectors,
3550
                                               BlockDriverCompletionFunc *cb,
3551
                                               void *opaque,
3552
                                               bool is_write)
3553
{
3554
    Coroutine *co;
3555
    BlockDriverAIOCBCoroutine *acb;
3556

    
3557
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3558
    acb->req.sector = sector_num;
3559
    acb->req.nb_sectors = nb_sectors;
3560
    acb->req.qiov = qiov;
3561
    acb->is_write = is_write;
3562

    
3563
    co = qemu_coroutine_create(bdrv_co_do_rw);
3564
    qemu_coroutine_enter(co, acb);
3565

    
3566
    return &acb->common;
3567
}
3568

    
3569
/*
 * Coroutine entry point for bdrv_aio_flush(): flush the device, record the
 * result in req.error and schedule the completion bottom half.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *co_acb = opaque;

    co_acb->req.error = bdrv_co_flush(co_acb->common.bs);

    co_acb->bh = qemu_bh_new(bdrv_co_em_bh, co_acb);
    qemu_bh_schedule(co_acb->bh);
}
3578

    
3579
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3580
        BlockDriverCompletionFunc *cb, void *opaque)
3581
{
3582
    trace_bdrv_aio_flush(bs, opaque);
3583

    
3584
    Coroutine *co;
3585
    BlockDriverAIOCBCoroutine *acb;
3586

    
3587
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3588
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3589
    qemu_coroutine_enter(co, acb);
3590

    
3591
    return &acb->common;
3592
}
3593

    
3594
/*
 * Coroutine entry point for bdrv_aio_discard(): discard the requested
 * sectors, record the result in req.error and schedule the completion
 * bottom half.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *co_acb = opaque;

    co_acb->req.error = bdrv_co_discard(co_acb->common.bs,
                                        co_acb->req.sector,
                                        co_acb->req.nb_sectors);

    co_acb->bh = qemu_bh_new(bdrv_co_em_bh, co_acb);
    qemu_bh_schedule(co_acb->bh);
}
3603

    
3604
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3605
        int64_t sector_num, int nb_sectors,
3606
        BlockDriverCompletionFunc *cb, void *opaque)
3607
{
3608
    Coroutine *co;
3609
    BlockDriverAIOCBCoroutine *acb;
3610

    
3611
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3612

    
3613
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3614
    acb->req.sector = sector_num;
3615
    acb->req.nb_sectors = nb_sectors;
3616
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3617
    qemu_coroutine_enter(co, acb);
3618

    
3619
    return &acb->common;
3620
}
3621

    
3622
void bdrv_init(void)
3623
{
3624
    module_call_init(MODULE_INIT_BLOCK);
3625
}
3626

    
3627
void bdrv_init_with_whitelist(void)
3628
{
3629
    use_bdrv_whitelist = 1;
3630
    bdrv_init();
3631
}
3632

    
3633
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3634
                   BlockDriverCompletionFunc *cb, void *opaque)
3635
{
3636
    BlockDriverAIOCB *acb;
3637

    
3638
    if (pool->free_aiocb) {
3639
        acb = pool->free_aiocb;
3640
        pool->free_aiocb = acb->next;
3641
    } else {
3642
        acb = g_malloc0(pool->aiocb_size);
3643
        acb->pool = pool;
3644
    }
3645
    acb->bs = bs;
3646
    acb->cb = cb;
3647
    acb->opaque = opaque;
3648
    return acb;
3649
}
3650

    
3651
void qemu_aio_release(void *p)
3652
{
3653
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3654
    AIOPool *pool = acb->pool;
3655
    acb->next = pool->free_aiocb;
3656
    pool->free_aiocb = acb;
3657
}
3658

    
3659
/**************************************************************/
3660
/* Coroutine block device emulation */
3661

    
3662
typedef struct CoroutineIOCompletion {
3663
    Coroutine *coroutine;
3664
    int ret;
3665
} CoroutineIOCompletion;
3666

    
3667
static void bdrv_co_io_em_complete(void *opaque, int ret)
3668
{
3669
    CoroutineIOCompletion *co = opaque;
3670

    
3671
    co->ret = ret;
3672
    qemu_coroutine_enter(co->coroutine, NULL);
3673
}
3674

    
3675
/*
 * Perform a read or write from coroutine context using the driver's AIO
 * interface.  The request is submitted with bdrv_co_io_em_complete() as its
 * callback and the coroutine yields until that callback re-enters it.
 * Returns -EIO when the driver does not return an AIOCB, otherwise the
 * result reported to the callback.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &completion);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &completion);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    qemu_coroutine_yield();

    return completion.ret;
}
3700

    
3701
/* Coroutine read emulated on top of the driver's AIO read. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
3707

    
3708
/* Coroutine write emulated on top of the driver's AIO write. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
3714

    
3715
/* Coroutine entry point for bdrv_flush(): store the flush result in the RwCo. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *state = opaque;

    state->ret = bdrv_co_flush(state->bs);
}
3721

    
3722
/*
 * Flush @bs from coroutine context.
 *
 * Nothing is done, and 0 is returned, when @bs is NULL, has no medium
 * inserted or is read-only.  Otherwise cached data is first written back to
 * the OS, then forced to disk unless BDRV_O_NO_FLUSH is set, and finally the
 * underlying bs->file is flushed in the same way.  The first error
 * encountered is returned.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion completion = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
                                          &completion);
            if (acb == NULL) {
                ret = -EIO;
            } else {
                /* Resumed by bdrv_co_io_em_complete() when the flush ends. */
                qemu_coroutine_yield();
                ret = completion.ret;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and therefore don't support bdrv_flush.  Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk.  Returning an
             * error doesn't work because that would break guests even if the
             * server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }
        if (ret < 0) {
            return ret;
        }
    }

    /*
     * Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
3782

    
3783
/*
 * Call the driver's bdrv_invalidate_cache hook for @bs.  Does nothing when
 * @bs has no driver or the driver has no such hook.
 */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv == NULL || bs->drv->bdrv_invalidate_cache == NULL) {
        return;
    }

    bs->drv->bdrv_invalidate_cache(bs);
}
3789

    
3790
void bdrv_invalidate_cache_all(void)
3791
{
3792
    BlockDriverState *bs;
3793

    
3794
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3795
        bdrv_invalidate_cache(bs);
3796
    }
3797
}
3798

    
3799
void bdrv_clear_incoming_migration_all(void)
3800
{
3801
    BlockDriverState *bs;
3802

    
3803
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3804
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3805
    }
3806
}
3807

    
3808
/*
 * Synchronous flush of @bs.
 *
 * Inside a coroutine the flush runs directly.  Otherwise a coroutine is
 * created for it and qemu_aio_wait() is called until a result has been
 * stored.  Returns the result of bdrv_co_flush().
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo state = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&state);
        return state.ret;
    }

    qemu_coroutine_enter(qemu_coroutine_create(bdrv_flush_co_entry), &state);
    while (state.ret == NOT_DONE) {
        qemu_aio_wait();
    }

    return state.ret;
}
3829

    
3830
/* Coroutine entry point for bdrv_discard(): store the result in the RwCo. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *state = opaque;

    state->ret = bdrv_co_discard(state->bs, state->sector_num,
                                 state->nb_sectors);
}
3836

    
3837
/*
 * Discard @nb_sectors sectors of @bs starting at @sector_num.
 *
 * Returns:
 *   -ENOMEDIUM  if no driver is attached,
 *   -EIO        if bdrv_check_request() rejects the range, or the driver's
 *               AIO discard callback returns no request handle,
 *   -EROFS      if @bs is read-only,
 *   the driver's result when it provides a coroutine or AIO discard
 *   callback, and 0 when the driver provides neither.
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
    if (bs->read_only) {
        return -EROFS;
    }

    /* Prefer the native coroutine implementation */
    if (drv->bdrv_co_discard) {
        return drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    }

    /* Fall back to the AIO callback and wait for its completion */
    if (drv->bdrv_aio_discard) {
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };
        BlockDriverAIOCB *acb;

        acb = drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                    bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        }
        qemu_coroutine_yield();
        return co.ret;
    }

    /* Driver has no discard support: nothing to do */
    return 0;
}
3866

    
3867
/*
 * Synchronous wrapper around bdrv_co_discard().
 *
 * Runs bdrv_discard_co_entry() directly when already inside a coroutine;
 * otherwise creates a coroutine for it and polls qemu_aio_wait() until the
 * request's result is no longer NOT_DONE.
 *
 * Returns the result stored in the request by bdrv_discard_co_entry().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *co = qemu_coroutine_create(bdrv_discard_co_entry);

        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
        return rwco.ret;
    }

    /* Already in coroutine context: take the fast path */
    bdrv_discard_co_entry(&rwco);
    return rwco.ret;
}
3890

    
3891
/**************************************************************/
3892
/* removable device support */
3893

    
3894
/**
3895
 * Return TRUE if the media is present
3896
 */
3897
int bdrv_is_inserted(BlockDriverState *bs)
3898
{
3899
    BlockDriver *drv = bs->drv;
3900

    
3901
    if (!drv)
3902
        return 0;
3903
    if (!drv->bdrv_is_inserted)
3904
        return 1;
3905
    return drv->bdrv_is_inserted(bs);
3906
}
3907

    
3908
/**
3909
 * Return whether the media changed since the last call to this
3910
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3911
 */
3912
int bdrv_media_changed(BlockDriverState *bs)
3913
{
3914
    BlockDriver *drv = bs->drv;
3915

    
3916
    if (drv && drv->bdrv_media_changed) {
3917
        return drv->bdrv_media_changed(bs);
3918
    }
3919
    return -ENOTSUP;
3920
}
3921

    
3922
/**
3923
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3924
 */
3925
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3926
{
3927
    BlockDriver *drv = bs->drv;
3928

    
3929
    if (drv && drv->bdrv_eject) {
3930
        drv->bdrv_eject(bs, eject_flag);
3931
    }
3932

    
3933
    if (bs->device_name[0] != '\0') {
3934
        bdrv_emit_qmp_eject_event(bs, eject_flag);
3935
    }
3936
}
3937

    
3938
/**
3939
 * Lock or unlock the media (if it is locked, the user won't be able
3940
 * to eject it manually).
3941
 */
3942
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3943
{
3944
    BlockDriver *drv = bs->drv;
3945

    
3946
    trace_bdrv_lock_medium(bs, locked);
3947

    
3948
    if (drv && drv->bdrv_lock_medium) {
3949
        drv->bdrv_lock_medium(bs, locked);
3950
    }
3951
}
3952

    
3953
/* needed for generic scsi interface */
3954

    
3955
/*
 * Forward ioctl request @req with argument @buf to the driver of @bs.
 * Returns the driver callback's result, or -ENOTSUP when no driver is
 * attached or the driver has no bdrv_ioctl callback.
 */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }
    return drv->bdrv_ioctl(bs, req, buf);
}
3963

    
3964
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3965
        unsigned long int req, void *buf,
3966
        BlockDriverCompletionFunc *cb, void *opaque)
3967
{
3968
    BlockDriver *drv = bs->drv;
3969

    
3970
    if (drv && drv->bdrv_aio_ioctl)
3971
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3972
    return NULL;
3973
}
3974

    
3975
/*
 * Record @align as the buffer alignment of @bs.  The value is read back
 * by qemu_blockalign() when it allocates buffers for this device.
 */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
3979

    
3980
/*
 * Allocate @size bytes through qemu_memalign(), aligned to the buffer
 * alignment of @bs.  Falls back to an alignment of 512 when @bs is NULL
 * or its buffer alignment is 0.
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    size_t alignment = 512;

    if (bs && bs->buffer_alignment) {
        alignment = bs->buffer_alignment;
    }
    return qemu_memalign(alignment, size);
}
3984

    
3985
/*
 * Enable (@enable non-zero) or disable (@enable zero) dirty tracking on
 * @bs.  The dirty counter is reset to 0 in both cases.
 *
 * Enabling allocates a zeroed bitmap sized from bdrv_getlength() unless a
 * bitmap already exists; disabling frees the bitmap.
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;

    if (!enable) {
        /* g_free() accepts NULL, so no guard is needed */
        g_free(bs->dirty_bitmap);
        bs->dirty_bitmap = NULL;
        return;
    }

    if (bs->dirty_bitmap) {
        /* Already tracking: keep the existing bitmap */
        return;
    }

    /* Number of longs needed, rounding the sector count up */
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

    bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
}
4005

    
4006
/*
 * Return 1 if the dirty-bitmap chunk containing @sector is marked dirty,
 * 0 otherwise.
 *
 * Also returns 0 when dirty tracking is not enabled (no bitmap), when
 * @sector is negative, or when @sector lies at or beyond the length
 * reported by bdrv_getlength().
 */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk;

    /*
     * Reject negative sectors up front: shifting a negative value left is
     * undefined, and a negative chunk would index before the bitmap.
     */
    if (!bs->dirty_bitmap || sector < 0 ||
        (sector << BDRV_SECTOR_BITS) >= bdrv_getlength(bs)) {
        return 0;
    }

    chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] &
              (1UL << (chunk % BITS_PER_LONG)));
}
4018

    
4019
/*
 * Hand the range of @nr_sectors sectors starting at @cur_sector to
 * set_dirty_bitmap() with a dirty value of 0.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    const int dirty = 0;

    set_dirty_bitmap(bs, cur_sector, nr_sectors, dirty);
}
4024

    
4025
/* Return the current value of the dirty counter of @bs. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    int64_t count = bs->dirty_count;

    return count;
}
4029

    
4030
/*
 * Set the in-use flag of @bs to @in_use.  The new value must differ from
 * the current one; this is enforced with an assertion.
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    /* Callers must toggle the flag, never set it to its current value */
    assert(bs->in_use != in_use);

    bs->in_use = in_use;
}
4035

    
4036
/* Return the in-use flag of @bs as set by bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    int in_use = bs->in_use;

    return in_use;
}
4040

    
4041
/*
 * Switch I/O status reporting on for @bs and start from
 * BLOCK_DEVICE_IO_STATUS_OK.
 */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    bs->iostatus_enabled = true;
}
4046

    
4047
/* The I/O status is only enabled if the drive explicitly
4048
 * enables it _and_ the VM is configured to stop on errors */
4049
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4050
{
4051
    return (bs->iostatus_enabled &&
4052
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4053
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
4054
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
4055
}
4056

    
4057
/*
 * Switch I/O status reporting off for @bs.  The stored status value
 * itself is left untouched.
 */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
4061

    
4062
/*
 * Put the I/O status of @bs back to BLOCK_DEVICE_IO_STATUS_OK, but only
 * while bdrv_iostatus_is_enabled() holds for it.
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
4068

    
4069
/* XXX: Today this is set by device models because it makes the implementation
4070
   quite simple. However, the block layer knows about the error, so it's
4071
   possible to implement this without device models being involved */
4072
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4073
{
4074
    if (bdrv_iostatus_is_enabled(bs) &&
4075
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4076
        assert(error >= 0);
4077
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4078
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
4079
    }
4080
}
4081

    
4082
void
4083
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4084
        enum BlockAcctType type)
4085
{
4086
    assert(type < BDRV_MAX_IOTYPE);
4087

    
4088
    cookie->bytes = bytes;
4089
    cookie->start_time_ns = get_clock();
4090
    cookie->type = type;
4091
}
4092

    
4093
void
4094
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4095
{
4096
    assert(cookie->type < BDRV_MAX_IOTYPE);
4097

    
4098
    bs->nr_bytes[cookie->type] += cookie->bytes;
4099
    bs->nr_ops[cookie->type]++;
4100
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4101
}
4102

    
4103
int bdrv_img_create(const char *filename, const char *fmt,
4104
                    const char *base_filename, const char *base_fmt,
4105
                    char *options, uint64_t img_size, int flags)
4106
{
4107
    QEMUOptionParameter *param = NULL, *create_options = NULL;
4108
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
4109
    BlockDriverState *bs = NULL;
4110
    BlockDriver *drv, *proto_drv;
4111
    BlockDriver *backing_drv = NULL;
4112
    int ret = 0;
4113

    
4114
    /* Find driver and parse its options */
4115
    drv = bdrv_find_format(fmt);
4116
    if (!drv) {
4117
        error_report("Unknown file format '%s'", fmt);
4118
        ret = -EINVAL;
4119
        goto out;
4120
    }
4121

    
4122
    proto_drv = bdrv_find_protocol(filename);
4123
    if (!proto_drv) {
4124
        error_report("Unknown protocol '%s'", filename);
4125
        ret = -EINVAL;
4126
        goto out;
4127
    }
4128

    
4129
    create_options = append_option_parameters(create_options,
4130
                                              drv->create_options);
4131
    create_options = append_option_parameters(create_options,
4132
                                              proto_drv->create_options);
4133

    
4134
    /* Create parameter list with default values */
4135
    param = parse_option_parameters("", create_options, param);
4136

    
4137
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4138

    
4139
    /* Parse -o options */
4140
    if (options) {
4141
        param = parse_option_parameters(options, create_options, param);
4142
        if (param == NULL) {
4143
            error_report("Invalid options for file format '%s'.", fmt);
4144
            ret = -EINVAL;
4145
            goto out;
4146
        }
4147
    }
4148

    
4149
    if (base_filename) {
4150
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4151
                                 base_filename)) {
4152
            error_report("Backing file not supported for file format '%s'",
4153
                         fmt);
4154
            ret = -EINVAL;
4155
            goto out;
4156
        }
4157
    }
4158

    
4159
    if (base_fmt) {
4160
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4161
            error_report("Backing file format not supported for file "
4162
                         "format '%s'", fmt);
4163
            ret = -EINVAL;
4164
            goto out;
4165
        }
4166
    }
4167

    
4168
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4169
    if (backing_file && backing_file->value.s) {
4170
        if (!strcmp(filename, backing_file->value.s)) {
4171
            error_report("Error: Trying to create an image with the "
4172
                         "same filename as the backing file");
4173
            ret = -EINVAL;
4174
            goto out;
4175
        }
4176
    }
4177

    
4178
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4179
    if (backing_fmt && backing_fmt->value.s) {
4180
        backing_drv = bdrv_find_format(backing_fmt->value.s);
4181
        if (!backing_drv) {
4182
            error_report("Unknown backing file format '%s'",
4183
                         backing_fmt->value.s);
4184
            ret = -EINVAL;
4185
            goto out;
4186
        }
4187
    }
4188

    
4189
    // The size for the image must always be specified, with one exception:
4190
    // If we are using a backing file, we can obtain the size from there
4191
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
4192
    if (size && size->value.n == -1) {
4193
        if (backing_file && backing_file->value.s) {
4194
            uint64_t size;
4195
            char buf[32];
4196
            int back_flags;
4197

    
4198
            /* backing files always opened read-only */
4199
            back_flags =
4200
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4201

    
4202
            bs = bdrv_new("");
4203

    
4204
            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4205
            if (ret < 0) {
4206
                error_report("Could not open '%s'", backing_file->value.s);
4207
                goto out;
4208
            }
4209
            bdrv_get_geometry(bs, &size);
4210
            size *= 512;
4211

    
4212
            snprintf(buf, sizeof(buf), "%" PRId64, size);
4213
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4214
        } else {
4215
            error_report("Image creation needs a size parameter");
4216
            ret = -EINVAL;
4217
            goto out;
4218
        }
4219
    }
4220

    
4221
    printf("Formatting '%s', fmt=%s ", filename, fmt);
4222
    print_option_parameters(param);
4223
    puts("");
4224

    
4225
    ret = bdrv_create(drv, filename, param);
4226

    
4227
    if (ret < 0) {
4228
        if (ret == -ENOTSUP) {
4229
            error_report("Formatting or formatting option not supported for "
4230
                         "file format '%s'", fmt);
4231
        } else if (ret == -EFBIG) {
4232
            error_report("The image size is too large for file format '%s'",
4233
                         fmt);
4234
        } else {
4235
            error_report("%s: error while creating %s: %s", filename, fmt,
4236
                         strerror(-ret));
4237
        }
4238
    }
4239

    
4240
out:
4241
    free_option_parameters(create_options);
4242
    free_option_parameters(param);
4243

    
4244
    if (bs) {
4245
        bdrv_delete(bs);
4246
    }
4247

    
4248
    return ret;
4249
}
4250

    
4251
/*
 * Allocate a block job of type @job_type for @bs and attach it to @bs.
 *
 * @speed: when non-zero, applied through block_job_set_speed(); a failure
 *     there undoes the creation.
 * @cb, @opaque: completion callback and its argument, stored in the job.
 * @errp: receives QERR_DEVICE_IN_USE when @bs already has a job or is in
 *     use, or the error raised while setting the speed.
 *
 * Returns the new job (zero-initialised memory of the type's
 * instance_size, marked busy), or NULL on error.  On success @bs is
 * marked in use.
 */
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    Error *local_err = NULL;
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type = job_type;
    job->bs = bs;
    job->cb = cb;
    job->opaque = opaque;
    job->busy = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed == 0) {
        return job;
    }

    block_job_set_speed(job, speed, &local_err);
    if (!error_is_set(&local_err)) {
        return job;
    }

    /* Setting the speed failed: detach and release the job again */
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
    error_propagate(errp, local_err);
    return NULL;
}
4286

    
4287
/*
 * Finish @job with result @ret: invoke the job's completion callback,
 * then detach the job from its device, free it and clear the device's
 * in-use flag.  @job must be the job currently attached to its device.
 */
void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *owner = job->bs;

    assert(owner->job == job);

    /* The callback runs while the job is still attached to the device */
    job->cb(job->opaque, ret);

    owner->job = NULL;
    g_free(job);
    bdrv_set_in_use(owner, 0);
}
4297

    
4298
/*
 * Change the speed of @job to @speed through the job type's set_speed
 * callback.
 *
 * @errp receives QERR_NOT_SUPPORTED when the job type has no set_speed
 * callback, or whatever error the callback raised.  job->speed is only
 * updated when the callback succeeds.
 */
void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }

    job->job_type->set_speed(job, speed, &local_err);
    if (!error_is_set(&local_err)) {
        job->speed = speed;
        return;
    }

    error_propagate(errp, local_err);
}
4314

    
4315
/*
 * Mark @job as cancelled.  If the job has a coroutine and is not busy,
 * the coroutine is entered right away.
 */
void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;

    if (!job->co || job->busy) {
        return;
    }
    qemu_coroutine_enter(job->co, NULL);
}
4322

    
4323
/* Return whether @job has been marked as cancelled. */
bool block_job_is_cancelled(BlockJob *job)
{
    bool cancelled = job->cancelled;

    return cancelled;
}
4327

    
4328
struct BlockCancelData {
4329
    BlockJob *job;
4330
    BlockDriverCompletionFunc *cb;
4331
    void *opaque;
4332
    bool cancelled;
4333
    int ret;
4334
};
4335

    
4336
static void block_job_cancel_cb(void *opaque, int ret)
4337
{
4338
    struct BlockCancelData *data = opaque;
4339

    
4340
    data->cancelled = block_job_is_cancelled(data->job);
4341
    data->ret = ret;
4342
    data->cb(data->opaque, ret);
4343
}
4344

    
4345
/*
 * Cancel @job and wait, polling qemu_aio_wait(), until its completion
 * callback has run.  @job must be the job attached to its device.
 *
 * Returns -ECANCELED if the job was flagged as cancelled and completed
 * with result 0; otherwise the job's own completion result.
 */
int block_job_cancel_sync(BlockJob *job)
{
    BlockDriverState *bs = job->bs;
    /*
     * Set up our own callback to store the result and chain to
     * the original callback.
     */
    struct BlockCancelData data = {
        .job = job,
        .cb = job->cb,
        .opaque = job->opaque,
        .ret = -EINPROGRESS,
    };

    assert(bs->job == job);

    job->cb = block_job_cancel_cb;
    job->opaque = &data;
    block_job_cancel(job);

    /* block_job_cancel_cb() overwrites data.ret when the job completes */
    while (data.ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    if (data.cancelled && data.ret == 0) {
        return -ECANCELED;
    }
    return data.ret;
}
4367

    
4368
/*
 * Put @job to sleep for @ns nanoseconds on @clock via co_sleep_ns().
 * The job is marked not busy for the duration of the sleep.  Returns
 * immediately, without sleeping, when the job is already cancelled.
 */
void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
{
    /* Check cancellation *before* setting busy = false, too!  */
    if (block_job_is_cancelled(job)) {
        return;
    }

    job->busy = false;
    co_sleep_ns(clock, ns);
    job->busy = true;
}