Statistics
| Branch: | Revision:

root / block.c @ fe235a06

History | View | Annotate | Download (113.5 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor.h"
28
#include "block_int.h"
29
#include "module.h"
30
#include "qjson.h"
31
#include "qemu-coroutine.h"
32
#include "qmp-commands.h"
33
#include "qemu-timer.h"
34

    
35
#ifdef CONFIG_BSD
36
#include <sys/types.h>
37
#include <sys/stat.h>
38
#include <sys/ioctl.h>
39
#include <sys/queue.h>
40
#ifndef __DragonFly__
41
#include <sys/disk.h>
42
#endif
43
#endif
44

    
45
#ifdef _WIN32
46
#include <windows.h>
47
#endif
48

    
49
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50

    
51
/*
 * Flags accepted by the internal bdrv_co_do_readv()/bdrv_co_do_writev()
 * helpers.  The values are distinct bits.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;
55

    
56
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59
        BlockDriverCompletionFunc *cb, void *opaque);
60
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62
        BlockDriverCompletionFunc *cb, void *opaque);
63
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64
                                         int64_t sector_num, int nb_sectors,
65
                                         QEMUIOVector *iov);
66
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67
                                         int64_t sector_num, int nb_sectors,
68
                                         QEMUIOVector *iov);
69
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71
    BdrvRequestFlags flags);
72
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74
    BdrvRequestFlags flags);
75
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76
                                               int64_t sector_num,
77
                                               QEMUIOVector *qiov,
78
                                               int nb_sectors,
79
                                               BlockDriverCompletionFunc *cb,
80
                                               void *opaque,
81
                                               bool is_write);
82
static void coroutine_fn bdrv_co_do_rw(void *opaque);
83
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84
    int64_t sector_num, int nb_sectors);
85

    
86
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87
        bool is_write, double elapsed_time, uint64_t *wait);
88
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89
        double elapsed_time, uint64_t *wait);
90
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91
        bool is_write, int64_t *wait);
92

    
93
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
95

    
96
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
98

    
99
/* The device to use for VM snapshots */
100
static BlockDriverState *bs_snapshots;
101

    
102
/* If non-zero, use only whitelisted block drivers */
103
static int use_bdrv_whitelist;
104

    
105
#ifdef _WIN32
106
/*
 * Return non-zero if filename starts with an ASCII letter immediately
 * followed by ':' (for example "c:" or "D:\dir"), zero otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}
112

    
113
/*
 * Return 1 if filename denotes a Windows drive, 0 otherwise.
 *
 * Two forms are accepted: a bare drive prefix with nothing after the
 * colon ("c:"), or a name for which strstart() matches one of the
 * device prefixes "\\.\" and "//./".
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
123
#endif
124

    
125
/* throttling disk I/O limits */
126
void bdrv_io_limits_disable(BlockDriverState *bs)
127
{
128
    bs->io_limits_enabled = false;
129

    
130
    while (qemu_co_queue_next(&bs->throttled_reqs));
131

    
132
    if (bs->block_timer) {
133
        qemu_del_timer(bs->block_timer);
134
        qemu_free_timer(bs->block_timer);
135
        bs->block_timer = NULL;
136
    }
137

    
138
    bs->slice_start = 0;
139
    bs->slice_end   = 0;
140
    bs->slice_time  = 0;
141
    memset(&bs->io_base, 0, sizeof(bs->io_base));
142
}
143

    
144
static void bdrv_block_timer(void *opaque)
145
{
146
    BlockDriverState *bs = opaque;
147

    
148
    qemu_co_queue_next(&bs->throttled_reqs);
149
}
150

    
151
/*
 * Enable I/O throttling on bs.
 *
 * Initializes the throttled request queue, creates the throttle timer on
 * vm_clock with bdrv_block_timer() as callback, starts a new accounting
 * slice of 5 * BLOCK_IO_SLICE_TIME beginning now, clears io_base and sets
 * the enabled flag.
 */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}
161

    
162
/*
 * Return true if any bps or iops limit (read, write or total) configured
 * in bs->io_limits is non-zero.
 *
 * Note this inspects the configured limits, not the bs->io_limits_enabled
 * flag.
 */
bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
172

    
173
/*
 * Delay the calling request while it would exceed the I/O limits of bs.
 *
 * @is_write:   true for a write request, false for a read
 * @nb_sectors: size of the request in sectors
 *
 * If other requests are already queued, the caller first waits behind
 * them.  It then re-arms the throttle timer and re-queues itself for as
 * long as bdrv_exceed_io_limits() reports the limits are exceeded, and
 * finally lets the next queued request proceed.
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* We want to keep each request's timing in FIFO order.  The next
     * throttled request is not dequeued until the current request is
     * allowed to be serviced.  So if the current request still exceeds the
     * limits, it is inserted at the head of the queue, and all requests
     * that followed it stay in the throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
197

    
198
/*
 * Check if the path starts with "<protocol>:".
 *
 * Returns non-zero when a ':' occurs before any path separator ('/', and
 * also '\' on Windows).  On Windows, drive names and drive-letter prefixes
 * are never treated as protocols.
 */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    /* p points at the first ':' or separator, or at the terminating NUL */
    return *p == ':';
}
215

    
216
/*
 * Return non-zero if path is absolute.
 *
 * On POSIX hosts that means it starts with '/'.  On Windows a leading '/'
 * or '\' also counts, as does anything accepted by is_windows_drive() or
 * is_windows_drive_prefix().
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}
228

    
229
/*
 * If filename is absolute, just copy it to dest.  Otherwise, build a
 * path to it by considering it relative to base_path.  URLs are
 * supported.
 *
 * @dest:      output buffer
 * @dest_size: size of dest in bytes; nothing is written if it is <= 0
 * @base_path: path (or URL) the relative filename is resolved against
 * @filename:  absolute or relative file name
 *
 * The directory part of base_path is everything up to and including the
 * later of its first ':' and its last path separator.  The result is
 * truncated to fit dest.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* p: just past the first ':' (protocol part), else the start */
        p = strchr(base_path, ':');
        if (p) {
            p++;
        } else {
            p = base_path;
        }
        /* p1: just past the last path separator, else the start */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1) {
                p1 = p2;
            }
        }
#endif
        if (p1) {
            p1++;
        } else {
            p1 = base_path;
        }
        if (p1 > p) {
            p = p1;
        }
        /* copy the directory prefix, clamped to the destination size */
        len = p - base_path;
        if (len > dest_size - 1) {
            len = dest_size - 1;
        }
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
272

    
273
/*
 * Write the backing file name of bs into dest (at most sz bytes).
 *
 * An empty backing file name, or one that carries a protocol prefix, is
 * copied unchanged.  Anything else is combined with bs->filename via
 * path_combine().
 */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
281

    
282
/*
 * Register a block driver: fill in emulation callbacks for the interfaces
 * the driver does not implement and add it to the head of bdrv_drivers.
 */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
301

    
302
/* create a new block device (by default it is empty) */
303
BlockDriverState *bdrv_new(const char *device_name)
304
{
305
    BlockDriverState *bs;
306

    
307
    bs = g_malloc0(sizeof(BlockDriverState));
308
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
309
    if (device_name[0] != '\0') {
310
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
311
    }
312
    bdrv_iostatus_disable(bs);
313
    return bs;
314
}
315

    
316
BlockDriver *bdrv_find_format(const char *format_name)
317
{
318
    BlockDriver *drv1;
319
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
320
        if (!strcmp(drv1->format_name, format_name)) {
321
            return drv1;
322
        }
323
    }
324
    return NULL;
325
}
326

    
327
/*
 * Return 1 if drv may be used according to the build-time whitelist,
 * 0 otherwise.  An empty whitelist allows every driver.
 *
 * NOTE(review): the loop relies on CONFIG_BDRV_WHITELIST expanding to a
 * NULL-terminated list of format names -- confirm against the build
 * configuration.
 */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}
344

    
345
/*
 * Like bdrv_find_format(), but returns NULL as well when the driver exists
 * and is not whitelisted.
 */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}
350

    
351
typedef struct CreateCo {
352
    BlockDriver *drv;
353
    char *filename;
354
    QEMUOptionParameter *options;
355
    int ret;
356
} CreateCo;
357

    
358
/*
 * Coroutine entry point for bdrv_create(): call the driver's bdrv_create
 * callback with the arguments in the CreateCo passed as opaque and store
 * its return value in cco->ret.
 */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}
365

    
366
int bdrv_create(BlockDriver *drv, const char* filename,
367
    QEMUOptionParameter *options)
368
{
369
    int ret;
370

    
371
    Coroutine *co;
372
    CreateCo cco = {
373
        .drv = drv,
374
        .filename = g_strdup(filename),
375
        .options = options,
376
        .ret = NOT_DONE,
377
    };
378

    
379
    if (!drv->bdrv_create) {
380
        return -ENOTSUP;
381
    }
382

    
383
    if (qemu_in_coroutine()) {
384
        /* Fast-path if already in coroutine context */
385
        bdrv_create_co_entry(&cco);
386
    } else {
387
        co = qemu_coroutine_create(bdrv_create_co_entry);
388
        qemu_coroutine_enter(co, &cco);
389
        while (cco.ret == NOT_DONE) {
390
            qemu_aio_wait();
391
        }
392
    }
393

    
394
    ret = cco.ret;
395
    g_free(cco.filename);
396

    
397
    return ret;
398
}
399

    
400
/*
 * Create filename using the protocol driver selected by
 * bdrv_find_protocol().  Returns -ENOENT if no protocol driver matches,
 * otherwise the result of bdrv_create().
 */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
411

    
412
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer receiving the name of the created file
 * @size:     size of the buffer in bytes (must be >= MAX_PATH on Windows)
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows the
 * negated GetLastError() code).  On POSIX hosts the file is created in
 * $TMPDIR, or in /tmp if that is unset; -EOVERFLOW is returned when the
 * name does not fit in the buffer.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno; report the close() failure */
        int saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
446

    
447
/*
448
 * Detect host devices. By convention, /dev/cdrom[N] is always
449
 * recognized as a host CDROM.
450
 */
451
static BlockDriver *find_hdev_driver(const char *filename)
452
{
453
    int score_max = 0, score;
454
    BlockDriver *drv = NULL, *d;
455

    
456
    QLIST_FOREACH(d, &bdrv_drivers, list) {
457
        if (d->bdrv_probe_device) {
458
            score = d->bdrv_probe_device(filename);
459
            if (score > score_max) {
460
                score_max = score;
461
                drv = d;
462
            }
463
        }
464
    }
465

    
466
    return drv;
467
}
468

    
469
BlockDriver *bdrv_find_protocol(const char *filename)
470
{
471
    BlockDriver *drv1;
472
    char protocol[128];
473
    int len;
474
    const char *p;
475

    
476
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
477

    
478
    /*
479
     * XXX(hch): we really should not let host device detection
480
     * override an explicit protocol specification, but moving this
481
     * later breaks access to device names with colons in them.
482
     * Thanks to the brain-dead persistent naming schemes on udev-
483
     * based Linux systems those actually are quite common.
484
     */
485
    drv1 = find_hdev_driver(filename);
486
    if (drv1) {
487
        return drv1;
488
    }
489

    
490
    if (!path_has_protocol(filename)) {
491
        return bdrv_find_format("file");
492
    }
493
    p = strchr(filename, ':');
494
    assert(p != NULL);
495
    len = p - filename;
496
    if (len > sizeof(protocol) - 1)
497
        len = sizeof(protocol) - 1;
498
    memcpy(protocol, filename, len);
499
    protocol[len] = '\0';
500
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
501
        if (drv1->protocol_name &&
502
            !strcmp(drv1->protocol_name, protocol)) {
503
            return drv1;
504
        }
505
    }
506
    return NULL;
507
}
508

    
509
/*
 * Probe filename for its image format.
 *
 * @filename: image to probe
 * @pdrv:     receives the detected driver, or NULL on failure
 *
 * The file is opened through bdrv_file_open() and up to 2048 bytes from
 * its start are handed to each driver's bdrv_probe callback.  The driver
 * with the highest score above zero wins.  scsi-generic devices and drives
 * with no medium inserted get the "raw" driver without probing.
 *
 * Returns a negative errno on failure (-ENOENT if no driver matched).  On
 * success the value is non-negative.
 */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* ret is passed to the probe callbacks as the length of buf's contents */
    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
557

    
558
/**
559
 * Set the current 'total_sectors' value
560
 */
561
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
562
{
563
    BlockDriver *drv = bs->drv;
564

    
565
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
566
    if (bs->sg)
567
        return 0;
568

    
569
    /* query actual device if possible, otherwise just trust the hint */
570
    if (drv->bdrv_getlength) {
571
        int64_t length = drv->bdrv_getlength(bs);
572
        if (length < 0) {
573
            return length;
574
        }
575
        hint = length >> BDRV_SECTOR_BITS;
576
    }
577

    
578
    bs->total_sectors = hint;
579
    return 0;
580
}
581

    
582
/**
583
 * Set open flags for a given cache mode
584
 *
585
 * Return 0 on success, -1 if the cache mode was invalid.
586
 */
587
int bdrv_parse_cache_flags(const char *mode, int *flags)
588
{
589
    *flags &= ~BDRV_O_CACHE_MASK;
590

    
591
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
592
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
593
    } else if (!strcmp(mode, "directsync")) {
594
        *flags |= BDRV_O_NOCACHE;
595
    } else if (!strcmp(mode, "writeback")) {
596
        *flags |= BDRV_O_CACHE_WB;
597
    } else if (!strcmp(mode, "unsafe")) {
598
        *flags |= BDRV_O_CACHE_WB;
599
        *flags |= BDRV_O_NO_FLUSH;
600
    } else if (!strcmp(mode, "writethrough")) {
601
        /* this is the default */
602
    } else {
603
        return -1;
604
    }
605

    
606
    return 0;
607
}
608

    
609
/**
610
 * The copy-on-read flag is actually a reference count so multiple users may
611
 * use the feature without worrying about clobbering its previous state.
612
 * Copy-on-read stays enabled until all users have called to disable it.
613
 */
614
void bdrv_enable_copy_on_read(BlockDriverState *bs)
615
{
616
    bs->copy_on_read++;
617
}
618

    
619
/*
 * Drop one copy-on-read reference taken with bdrv_enable_copy_on_read().
 * Asserts that at least one reference is held.
 */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
624

    
625
/*
626
 * Common part for opening disk images and files
627
 */
628
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
629
    int flags, BlockDriver *drv)
630
{
631
    int ret, open_flags;
632

    
633
    assert(drv != NULL);
634
    assert(bs->file == NULL);
635

    
636
    trace_bdrv_open_common(bs, filename, flags, drv->format_name);
637

    
638
    bs->open_flags = flags;
639
    bs->buffer_alignment = 512;
640

    
641
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
642
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
643
        bdrv_enable_copy_on_read(bs);
644
    }
645

    
646
    pstrcpy(bs->filename, sizeof(bs->filename), filename);
647

    
648
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
649
        return -ENOTSUP;
650
    }
651

    
652
    bs->drv = drv;
653
    bs->opaque = g_malloc0(drv->instance_size);
654

    
655
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
656
    open_flags = flags | BDRV_O_CACHE_WB;
657

    
658
    /*
659
     * Clear flags that are internal to the block layer before opening the
660
     * image.
661
     */
662
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
663

    
664
    /*
665
     * Snapshots should be writable.
666
     */
667
    if (bs->is_temporary) {
668
        open_flags |= BDRV_O_RDWR;
669
    }
670

    
671
    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
672

    
673
    /* Open the image, either directly or using a protocol */
674
    if (drv->bdrv_file_open) {
675
        ret = drv->bdrv_file_open(bs, filename, open_flags);
676
    } else {
677
        ret = bdrv_file_open(&bs->file, filename, open_flags);
678
        if (ret >= 0) {
679
            ret = drv->bdrv_open(bs, open_flags);
680
        }
681
    }
682

    
683
    if (ret < 0) {
684
        goto free_and_fail;
685
    }
686

    
687
    ret = refresh_total_sectors(bs, bs->total_sectors);
688
    if (ret < 0) {
689
        goto free_and_fail;
690
    }
691

    
692
#ifndef _WIN32
693
    if (bs->is_temporary) {
694
        unlink(filename);
695
    }
696
#endif
697
    return 0;
698

    
699
free_and_fail:
700
    if (bs->file) {
701
        bdrv_delete(bs->file);
702
        bs->file = NULL;
703
    }
704
    g_free(bs->opaque);
705
    bs->opaque = NULL;
706
    bs->drv = NULL;
707
    return ret;
708
}
709

    
710
/*
711
 * Opens a file using a protocol (file, host_device, nbd, ...)
712
 */
713
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
714
{
715
    BlockDriverState *bs;
716
    BlockDriver *drv;
717
    int ret;
718

    
719
    drv = bdrv_find_protocol(filename);
720
    if (!drv) {
721
        return -ENOENT;
722
    }
723

    
724
    bs = bdrv_new("");
725
    ret = bdrv_open_common(bs, filename, flags, drv);
726
    if (ret < 0) {
727
        bdrv_delete(bs);
728
        return ret;
729
    }
730
    bs->growable = 1;
731
    *pbs = bs;
732
    return 0;
733
}
734

    
735
/*
736
 * Opens a disk image (raw, qcow2, vmdk, ...)
737
 */
738
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
739
              BlockDriver *drv)
740
{
741
    int ret;
742
    char tmp_filename[PATH_MAX];
743

    
744
    if (flags & BDRV_O_SNAPSHOT) {
745
        BlockDriverState *bs1;
746
        int64_t total_size;
747
        int is_protocol = 0;
748
        BlockDriver *bdrv_qcow2;
749
        QEMUOptionParameter *options;
750
        char backing_filename[PATH_MAX];
751

    
752
        /* if snapshot, we create a temporary backing file and open it
753
           instead of opening 'filename' directly */
754

    
755
        /* if there is a backing file, use it */
756
        bs1 = bdrv_new("");
757
        ret = bdrv_open(bs1, filename, 0, drv);
758
        if (ret < 0) {
759
            bdrv_delete(bs1);
760
            return ret;
761
        }
762
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
763

    
764
        if (bs1->drv && bs1->drv->protocol_name)
765
            is_protocol = 1;
766

    
767
        bdrv_delete(bs1);
768

    
769
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
770
        if (ret < 0) {
771
            return ret;
772
        }
773

    
774
        /* Real path is meaningless for protocols */
775
        if (is_protocol)
776
            snprintf(backing_filename, sizeof(backing_filename),
777
                     "%s", filename);
778
        else if (!realpath(filename, backing_filename))
779
            return -errno;
780

    
781
        bdrv_qcow2 = bdrv_find_format("qcow2");
782
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
783

    
784
        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
785
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
786
        if (drv) {
787
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
788
                drv->format_name);
789
        }
790

    
791
        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
792
        free_option_parameters(options);
793
        if (ret < 0) {
794
            return ret;
795
        }
796

    
797
        filename = tmp_filename;
798
        drv = bdrv_qcow2;
799
        bs->is_temporary = 1;
800
    }
801

    
802
    /* Find the right image format driver */
803
    if (!drv) {
804
        ret = find_image_format(filename, &drv);
805
    }
806

    
807
    if (!drv) {
808
        goto unlink_and_fail;
809
    }
810

    
811
    /* Open the image */
812
    ret = bdrv_open_common(bs, filename, flags, drv);
813
    if (ret < 0) {
814
        goto unlink_and_fail;
815
    }
816

    
817
    /* If there is a backing file, use it */
818
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
819
        char backing_filename[PATH_MAX];
820
        int back_flags;
821
        BlockDriver *back_drv = NULL;
822

    
823
        bs->backing_hd = bdrv_new("");
824
        bdrv_get_full_backing_filename(bs, backing_filename,
825
                                       sizeof(backing_filename));
826

    
827
        if (bs->backing_format[0] != '\0') {
828
            back_drv = bdrv_find_format(bs->backing_format);
829
        }
830

    
831
        /* backing files always opened read-only */
832
        back_flags =
833
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
834

    
835
        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
836
        if (ret < 0) {
837
            bdrv_close(bs);
838
            return ret;
839
        }
840
        if (bs->is_temporary) {
841
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
842
        } else {
843
            /* base image inherits from "parent" */
844
            bs->backing_hd->keep_read_only = bs->keep_read_only;
845
        }
846
    }
847

    
848
    if (!bdrv_key_required(bs)) {
849
        bdrv_dev_change_media_cb(bs, true);
850
    }
851

    
852
    /* throttling disk I/O limits */
853
    if (bs->io_limits_enabled) {
854
        bdrv_io_limits_enable(bs);
855
    }
856

    
857
    return 0;
858

    
859
unlink_and_fail:
860
    if (bs->is_temporary) {
861
        unlink(filename);
862
    }
863
    return ret;
864
}
865

    
866
/*
 * Close bs and reset it to the unopened state.
 *
 * The device is flushed first.  If a driver is attached: a running block
 * job is cancelled synchronously, all pending I/O is drained, the backing
 * device and the underlying protocol file are deleted, the driver's close
 * callback is run and the per-image fields are cleared.  A media-removed
 * notification is then sent and I/O throttling is disabled if it was on.
 */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* on non-Windows hosts bdrv_open_common() unlinks it right after
           opening */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
913

    
914
void bdrv_close_all(void)
915
{
916
    BlockDriverState *bs;
917

    
918
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
919
        bdrv_close(bs);
920
    }
921
}
922

    
923
/*
924
 * Wait for pending requests to complete across all BlockDriverStates
925
 *
926
 * This function does not flush data to disk, use bdrv_flush_all() for that
927
 * after calling this function.
928
 *
929
 * Note that completion of an asynchronous I/O operation can trigger any
930
 * number of other I/O operations on other devices---for example a coroutine
931
 * can be arbitrarily complex and a constant flow of I/O can come until the
932
 * coroutine is complete.  Because of this, it is not possible to have a
933
 * function to drain a single device's I/O queue.
934
 */
935
void bdrv_drain_all(void)
936
{
937
    BlockDriverState *bs;
938
    bool busy;
939

    
940
    do {
941
        busy = qemu_aio_wait();
942

    
943
        /* FIXME: We do not have timer support here, so this is effectively
944
         * a busy wait.
945
         */
946
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
947
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
948
                qemu_co_queue_restart_all(&bs->throttled_reqs);
949
                busy = true;
950
            }
951
        }
952
    } while (busy);
953

    
954
    /* If requests are still pending there is a bug somewhere */
955
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
956
        assert(QLIST_EMPTY(&bs->tracked_requests));
957
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
958
    }
959
}
960

    
961
/* make a BlockDriverState anonymous by removing from bdrv_state list.
962
   Also, NULL terminate the device_name to prevent double remove */
963
void bdrv_make_anon(BlockDriverState *bs)
964
{
965
    if (bs->device_name[0] != '\0') {
966
        QTAILQ_REMOVE(&bdrv_states, bs, list);
967
    }
968
    bs->device_name[0] = '\0';
969
}
970

    
971
/*
 * Invoke the driver's optional bdrv_rebind callback on bs.  Does nothing
 * if bs has no driver or the driver does not implement the callback.
 */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
977

    
978
/*
 * Copy from bs_src into bs_dest the fields that need to stay attached to
 * the device (as opposed to the image contents).  Used by bdrv_swap().
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    bs_dest->open_flags = bs_src->open_flags;

    /* attached device and its callbacks */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* I/O throttling state */
    bs_dest->slice_time = bs_src->slice_time;
    bs_dest->slice_start = bs_src->slice_start;
    bs_dest->slice_end = bs_src->slice_end;
    bs_dest->io_limits = bs_src->io_limits;
    bs_dest->io_base = bs_src->io_base;
    bs_dest->throttled_reqs = bs_src->throttled_reqs;
    bs_dest->block_timer = bs_src->block_timer;
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* read/write error policy */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* I/O status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_count = bs_src->dirty_count;
    bs_dest->dirty_bitmap = bs_src->dirty_bitmap;

    /* block job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* name and list linkage, so the bdrv_states entry stays the same */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}
1024

    
1025
/*
1026
 * Swap bs contents for two image chains while they are live,
1027
 * while keeping required fields on the BlockDriverState that is
1028
 * actually attached to a device.
1029
 *
1030
 * This will modify the BlockDriverState fields, and swap contents
1031
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1032
 *
1033
 * bs_new is required to be anonymous.
1034
 *
1035
 * This function does not create any image files.
1036
 */
1037
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1038
{
1039
    BlockDriverState tmp;
1040

    
1041
    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1042
    assert(bs_new->device_name[0] == '\0');
1043
    assert(bs_new->dirty_bitmap == NULL);
1044
    assert(bs_new->job == NULL);
1045
    assert(bs_new->dev == NULL);
1046
    assert(bs_new->in_use == 0);
1047
    assert(bs_new->io_limits_enabled == false);
1048
    assert(bs_new->block_timer == NULL);
1049

    
1050
    tmp = *bs_new;
1051
    *bs_new = *bs_old;
1052
    *bs_old = tmp;
1053

    
1054
    /* there are some fields that should not be swapped, move them back */
1055
    bdrv_move_feature_fields(&tmp, bs_old);
1056
    bdrv_move_feature_fields(bs_old, bs_new);
1057
    bdrv_move_feature_fields(bs_new, &tmp);
1058

    
1059
    /* bs_new shouldn't be in bdrv_states even after the swap!  */
1060
    assert(bs_new->device_name[0] == '\0');
1061

    
1062
    /* Check a few fields that should remain attached to the device */
1063
    assert(bs_new->dev == NULL);
1064
    assert(bs_new->job == NULL);
1065
    assert(bs_new->in_use == 0);
1066
    assert(bs_new->io_limits_enabled == false);
1067
    assert(bs_new->block_timer == NULL);
1068

    
1069
    bdrv_rebind(bs_new);
1070
    bdrv_rebind(bs_old);
1071
}
1072

    
1073
/*
1074
 * Add new bs contents at the top of an image chain while the chain is
1075
 * live, while keeping required fields on the top layer.
1076
 *
1077
 * This will modify the BlockDriverState fields, and swap contents
1078
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1079
 *
1080
 * bs_new is required to be anonymous.
1081
 *
1082
 * This function does not create any image files.
1083
 */
1084
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1085
{
1086
    bdrv_swap(bs_new, bs_top);
1087

    
1088
    /* The contents of 'tmp' will become bs_top, as we are
1089
     * swapping bs_new and bs_top contents. */
1090
    bs_top->backing_hd = bs_new;
1091
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1092
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1093
            bs_new->filename);
1094
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1095
            bs_new->drv ? bs_new->drv->format_name : "");
1096
}
1097

    
1098
/*
 * Close and free a BlockDriverState.  It must not have a device attached,
 * a job running, or be marked in use.
 */
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* unlink from bdrv_states if it is still named */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    /* by now bs must no longer be the snapshot device */
    assert(bs != bs_snapshots);
    g_free(bs);
}
1112

    
1113
/*
 * Attach dev to bs and reset the I/O status.
 * Returns 0 on success, -EBUSY if a device is already attached.
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (!bs->dev) {
        bs->dev = dev;
        bdrv_iostatus_reset(bs);
        return 0;
    }

    return -EBUSY;
}
1123

    
1124
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1125
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1126
{
1127
    if (bdrv_attach_dev(bs, dev) < 0) {
1128
        abort();
1129
    }
1130
}
1131

    
1132
/*
 * Detach dev from bs.  dev must be the device currently attached.  The
 * device callbacks are cleared and buffer_alignment is reset to 512.
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);

    bs->dev_opaque = NULL;
    bs->dev_ops = NULL;
    bs->dev = NULL;
    bs->buffer_alignment = 512;
}
1141

    
1142
/* TODO change to return DeviceState * when all users are qdevified */
1143
void *bdrv_get_attached_dev(BlockDriverState *bs)
1144
{
1145
    return bs->dev;
1146
}
1147

    
1148
/*
 * Install the device callback table and its opaque argument on bs.
 * If bs is the current snapshot device and now counts as having removable
 * media, bs_snapshots is cleared.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;

    if (bs == bs_snapshots && bdrv_dev_has_removable_media(bs)) {
        bs_snapshots = NULL;
    }
}
1157

    
1158
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1159
                               BlockQMPEventAction action, int is_read)
1160
{
1161
    QObject *data;
1162
    const char *action_str;
1163

    
1164
    switch (action) {
1165
    case BDRV_ACTION_REPORT:
1166
        action_str = "report";
1167
        break;
1168
    case BDRV_ACTION_IGNORE:
1169
        action_str = "ignore";
1170
        break;
1171
    case BDRV_ACTION_STOP:
1172
        action_str = "stop";
1173
        break;
1174
    default:
1175
        abort();
1176
    }
1177

    
1178
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1179
                              bdrv->device_name,
1180
                              action_str,
1181
                              is_read ? "read" : "write");
1182
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1183

    
1184
    qobject_decref(data);
1185
}
1186

    
1187
/*
 * Emit a QEVENT_DEVICE_TRAY_MOVED monitor event for bs; 'ejected' is
 * reported as the event's 'tray-open' value.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *event = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                                        bdrv_get_device_name(bs), ejected);

    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event);
    qobject_decref(event);
}
1197

    
1198
/*
 * Tell the attached device that the medium changed, if it provides a
 * change_media_cb, and emit the matching tray events: "open" if the tray
 * was closed before the callback ran, then "closed" if a medium was loaded.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool tray_was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* sample the tray state before the device gets to change it */
    tray_was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (tray_was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1213

    
1214
/*
 * True if no device is attached, or if the attached device provides a
 * change_media_cb callback.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }

    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
1218

    
1219
/* Forward an eject request to the device's eject_request_cb, if it has one */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (!bs->dev_ops || !bs->dev_ops->eject_request_cb) {
        return;
    }

    bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
}
1225

    
1226
/*
 * Ask the device whether its tray is open.  Returns false when the device
 * provides no is_tray_open callback.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_tray_open) {
        return false;
    }

    return bs->dev_ops->is_tray_open(bs->dev_opaque);
}
1233

    
1234
/* Call the device's resize_cb callback, if it has one */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->resize_cb) {
        return;
    }

    bs->dev_ops->resize_cb(bs->dev_opaque);
}
1240

    
1241
/*
 * Ask the device whether its medium is locked.  Returns false when the
 * device provides no is_medium_locked callback.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (!bs->dev_ops || !bs->dev_ops->is_medium_locked) {
        return false;
    }

    return bs->dev_ops->is_medium_locked(bs->dev_opaque);
}
1248

    
1249
/*
1250
 * Run consistency checks on an image
1251
 *
1252
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1253
 * free of errors) or -errno when an internal error occurred. The results of the
1254
 * check are stored in res.
1255
 */
1256
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1257
{
1258
    if (bs->drv->bdrv_check == NULL) {
1259
        return -ENOTSUP;
1260
    }
1261

    
1262
    memset(res, 0, sizeof(*res));
1263
    return bs->drv->bdrv_check(bs, res, fix);
1264
}
1265

    
1266
#define COMMIT_BUF_SECTORS 2048
1267

    
1268
/* commit COW file into the raw image */
1269
int bdrv_commit(BlockDriverState *bs)
1270
{
1271
    BlockDriver *drv = bs->drv;
1272
    BlockDriver *backing_drv;
1273
    int64_t sector, total_sectors;
1274
    int n, ro, open_flags;
1275
    int ret = 0, rw_ret = 0;
1276
    uint8_t *buf;
1277
    char filename[1024];
1278
    BlockDriverState *bs_rw, *bs_ro;
1279

    
1280
    if (!drv)
1281
        return -ENOMEDIUM;
1282
    
1283
    if (!bs->backing_hd) {
1284
        return -ENOTSUP;
1285
    }
1286

    
1287
    if (bs->backing_hd->keep_read_only) {
1288
        return -EACCES;
1289
    }
1290

    
1291
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1292
        return -EBUSY;
1293
    }
1294

    
1295
    backing_drv = bs->backing_hd->drv;
1296
    ro = bs->backing_hd->read_only;
1297
    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1298
    open_flags =  bs->backing_hd->open_flags;
1299

    
1300
    if (ro) {
1301
        /* re-open as RW */
1302
        bdrv_delete(bs->backing_hd);
1303
        bs->backing_hd = NULL;
1304
        bs_rw = bdrv_new("");
1305
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1306
            backing_drv);
1307
        if (rw_ret < 0) {
1308
            bdrv_delete(bs_rw);
1309
            /* try to re-open read-only */
1310
            bs_ro = bdrv_new("");
1311
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1312
                backing_drv);
1313
            if (ret < 0) {
1314
                bdrv_delete(bs_ro);
1315
                /* drive not functional anymore */
1316
                bs->drv = NULL;
1317
                return ret;
1318
            }
1319
            bs->backing_hd = bs_ro;
1320
            return rw_ret;
1321
        }
1322
        bs->backing_hd = bs_rw;
1323
    }
1324

    
1325
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1326
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1327

    
1328
    for (sector = 0; sector < total_sectors; sector += n) {
1329
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1330

    
1331
            if (bdrv_read(bs, sector, buf, n) != 0) {
1332
                ret = -EIO;
1333
                goto ro_cleanup;
1334
            }
1335

    
1336
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1337
                ret = -EIO;
1338
                goto ro_cleanup;
1339
            }
1340
        }
1341
    }
1342

    
1343
    if (drv->bdrv_make_empty) {
1344
        ret = drv->bdrv_make_empty(bs);
1345
        bdrv_flush(bs);
1346
    }
1347

    
1348
    /*
1349
     * Make sure all data we wrote to the backing device is actually
1350
     * stable on disk.
1351
     */
1352
    if (bs->backing_hd)
1353
        bdrv_flush(bs->backing_hd);
1354

    
1355
ro_cleanup:
1356
    g_free(buf);
1357

    
1358
    if (ro) {
1359
        /* re-open as RO */
1360
        bdrv_delete(bs->backing_hd);
1361
        bs->backing_hd = NULL;
1362
        bs_ro = bdrv_new("");
1363
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1364
            backing_drv);
1365
        if (ret < 0) {
1366
            bdrv_delete(bs_ro);
1367
            /* drive not functional anymore */
1368
            bs->drv = NULL;
1369
            return ret;
1370
        }
1371
        bs->backing_hd = bs_ro;
1372
        bs->backing_hd->keep_read_only = 0;
1373
    }
1374

    
1375
    return ret;
1376
}
1377

    
1378
int bdrv_commit_all(void)
1379
{
1380
    BlockDriverState *bs;
1381

    
1382
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1383
        int ret = bdrv_commit(bs);
1384
        if (ret < 0) {
1385
            return ret;
1386
        }
1387
    }
1388
    return 0;
1389
}
1390

    
1391
struct BdrvTrackedRequest {
1392
    BlockDriverState *bs;
1393
    int64_t sector_num;
1394
    int nb_sectors;
1395
    bool is_write;
1396
    QLIST_ENTRY(BdrvTrackedRequest) list;
1397
    Coroutine *co; /* owner, used for deadlock detection */
1398
    CoQueue wait_queue; /* coroutines blocked on this request */
1399
};
1400

    
1401
/**
1402
 * Remove an active request from the tracked requests list
1403
 *
1404
 * This function should be called when a tracked request is completing.
1405
 */
1406
static void tracked_request_end(BdrvTrackedRequest *req)
1407
{
1408
    QLIST_REMOVE(req, list);
1409
    qemu_co_queue_restart_all(&req->wait_queue);
1410
}
1411

    
1412
/**
1413
 * Add an active request to the tracked requests list
1414
 */
1415
static void tracked_request_begin(BdrvTrackedRequest *req,
1416
                                  BlockDriverState *bs,
1417
                                  int64_t sector_num,
1418
                                  int nb_sectors, bool is_write)
1419
{
1420
    *req = (BdrvTrackedRequest){
1421
        .bs = bs,
1422
        .sector_num = sector_num,
1423
        .nb_sectors = nb_sectors,
1424
        .is_write = is_write,
1425
        .co = qemu_coroutine_self(),
1426
    };
1427

    
1428
    qemu_co_queue_init(&req->wait_queue);
1429

    
1430
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1431
}
1432

    
1433
/**
1434
 * Round a region to cluster boundaries
1435
 */
1436
static void round_to_clusters(BlockDriverState *bs,
1437
                              int64_t sector_num, int nb_sectors,
1438
                              int64_t *cluster_sector_num,
1439
                              int *cluster_nb_sectors)
1440
{
1441
    BlockDriverInfo bdi;
1442

    
1443
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1444
        *cluster_sector_num = sector_num;
1445
        *cluster_nb_sectors = nb_sectors;
1446
    } else {
1447
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1448
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1449
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1450
                                            nb_sectors, c);
1451
    }
1452
}
1453

    
1454
/*
 * Return true if the sector range [sector_num, sector_num + nb_sectors)
 * intersects the range covered by req.
 */
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    /* the ranges overlap unless one lies entirely before the other */
    return sector_num < req->sector_num + req->nb_sectors &&
           req->sector_num < sector_num + nb_sectors;
}
1466

    
1467
/*
 * Block the calling coroutine until no tracked request on bs overlaps the
 * cluster-aligned region around [sector_num, sector_num + nb_sectors).
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool waited;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    for (;;) {
        waited = false;

        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (!tracked_request_overlaps(req, cluster_sector_num,
                                          cluster_nb_sectors)) {
                continue;
            }

            /* Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests.  This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            qemu_co_queue_wait(&req->wait_queue);

            /* the list may have changed while we slept: rescan from the top */
            waited = true;
            break;
        }

        if (!waited) {
            break;
        }
    }
}
1502

    
1503
/*
1504
 * Return values:
1505
 * 0        - success
1506
 * -EINVAL  - backing format specified, but no file
1507
 * -ENOSPC  - can't update the backing file because no space is left in the
1508
 *            image file header
1509
 * -ENOTSUP - format driver doesn't support changing the backing file
1510
 */
1511
int bdrv_change_backing_file(BlockDriverState *bs,
1512
    const char *backing_file, const char *backing_fmt)
1513
{
1514
    BlockDriver *drv = bs->drv;
1515
    int ret;
1516

    
1517
    /* Backing file format doesn't make sense without a backing file */
1518
    if (backing_fmt && !backing_file) {
1519
        return -EINVAL;
1520
    }
1521

    
1522
    if (drv->bdrv_change_backing_file != NULL) {
1523
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1524
    } else {
1525
        ret = -ENOTSUP;
1526
    }
1527

    
1528
    if (ret == 0) {
1529
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1530
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1531
    }
1532
    return ret;
1533
}
1534

    
1535
/*
 * Validate a byte-granularity request against the device.
 *
 * Returns -ENOMEDIUM if no medium is inserted, 0 if the device is growable
 * or the request lies within the device length, and -EIO for a negative
 * offset or a request that extends past the end.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    /* growable devices accept any range */
    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0 || (offset > len) || (len - offset < size)) {
        return -EIO;
    }

    return 0;
}
1556

    
1557
/*
 * Sector-granularity wrapper around bdrv_check_byte_request(): converts the
 * sector range to bytes and returns that function's result.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    int64_t offset = sector_num * BDRV_SECTOR_SIZE;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;

    return bdrv_check_byte_request(bs, offset, size);
}
1563

    
1564
typedef struct RwCo {
1565
    BlockDriverState *bs;
1566
    int64_t sector_num;
1567
    int nb_sectors;
1568
    QEMUIOVector *qiov;
1569
    bool is_write;
1570
    int ret;
1571
} RwCo;
1572

    
1573
/*
 * Coroutine entry point for bdrv_rw_co(): performs the read or write
 * described by the RwCo passed in 'opaque' and stores the result in it.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (rwco->is_write) {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    }
}
1585

    
1586
/*
1587
 * Process a synchronous request using coroutines
1588
 */
1589
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1590
                      int nb_sectors, bool is_write)
1591
{
1592
    QEMUIOVector qiov;
1593
    struct iovec iov = {
1594
        .iov_base = (void *)buf,
1595
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1596
    };
1597
    Coroutine *co;
1598
    RwCo rwco = {
1599
        .bs = bs,
1600
        .sector_num = sector_num,
1601
        .nb_sectors = nb_sectors,
1602
        .qiov = &qiov,
1603
        .is_write = is_write,
1604
        .ret = NOT_DONE,
1605
    };
1606

    
1607
    qemu_iovec_init_external(&qiov, &iov, 1);
1608

    
1609
    /**
1610
     * In sync call context, when the vcpu is blocked, this throttling timer
1611
     * will not fire; so the I/O throttling function has to be disabled here
1612
     * if it has been enabled.
1613
     */
1614
    if (bs->io_limits_enabled) {
1615
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
1616
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
1617
        bdrv_io_limits_disable(bs);
1618
    }
1619

    
1620
    if (qemu_in_coroutine()) {
1621
        /* Fast-path if already in coroutine context */
1622
        bdrv_rw_co_entry(&rwco);
1623
    } else {
1624
        co = qemu_coroutine_create(bdrv_rw_co_entry);
1625
        qemu_coroutine_enter(co, &rwco);
1626
        while (rwco.ret == NOT_DONE) {
1627
            qemu_aio_wait();
1628
        }
1629
    }
1630
    return rwco.ret;
1631
}
1632

    
1633
/* return < 0 if error. See bdrv_write() for the return codes */
1634
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1635
              uint8_t *buf, int nb_sectors)
1636
{
1637
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1638
}
1639

    
1640
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
1641
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
1642
                          uint8_t *buf, int nb_sectors)
1643
{
1644
    bool enabled;
1645
    int ret;
1646

    
1647
    enabled = bs->io_limits_enabled;
1648
    bs->io_limits_enabled = false;
1649
    ret = bdrv_read(bs, 0, buf, 1);
1650
    bs->io_limits_enabled = enabled;
1651
    return ret;
1652
}
1653

    
1654
#define BITS_PER_LONG  (sizeof(unsigned long) * 8)
1655

    
1656
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1657
                             int nb_sectors, int dirty)
1658
{
1659
    int64_t start, end;
1660
    unsigned long val, idx, bit;
1661

    
1662
    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1663
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1664

    
1665
    for (; start <= end; start++) {
1666
        idx = start / BITS_PER_LONG;
1667
        bit = start % BITS_PER_LONG;
1668
        val = bs->dirty_bitmap[idx];
1669
        if (dirty) {
1670
            if (!(val & (1UL << bit))) {
1671
                bs->dirty_count++;
1672
                val |= 1UL << bit;
1673
            }
1674
        } else {
1675
            if (val & (1UL << bit)) {
1676
                bs->dirty_count--;
1677
                val &= ~(1UL << bit);
1678
            }
1679
        }
1680
        bs->dirty_bitmap[idx] = val;
1681
    }
1682
}
1683

    
1684
/* Return < 0 if error. Important errors are:
1685
  -EIO         generic I/O error (may happen for all errors)
1686
  -ENOMEDIUM   No media inserted.
1687
  -EINVAL      Invalid sector number or nb_sectors
1688
  -EACCES      Trying to write a read-only device
1689
*/
1690
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1691
               const uint8_t *buf, int nb_sectors)
1692
{
1693
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1694
}
1695

    
1696
/*
 * Byte-granularity synchronous read of count1 bytes at byte 'offset'.
 *
 * An unaligned head and tail are read through a one-sector bounce buffer;
 * whole sectors in between are read straight into buf.
 * Returns count1 on success or the negative error from bdrv_read().
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    uint8_t *dst = buf;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int len, nb_sectors, ret;

    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > remaining) {
        len = remaining;
    }
    if (len > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        remaining -= len;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        dst += len;
    }

    /* read the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_read(bs, sector_num, dst, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        dst += len;
        remaining -= len;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(dst, tmp_buf, remaining);
    }

    return count1;
}
1740

    
1741
/*
 * Byte-granularity synchronous write of count1 bytes at byte 'offset'.
 *
 * An unaligned head and tail are handled by read-modify-write of a single
 * sector through a bounce buffer; whole sectors in between are written
 * straight from buf.
 * Returns count1 on success or the negative error from bdrv_read() or
 * bdrv_write().
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    const uint8_t *src = buf;
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int len, nb_sectors, ret;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > remaining) {
        len = remaining;
    }
    if (len > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), src, len);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        remaining -= len;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        src += len;
    }

    /* write the sectors "in place" */
    nb_sectors = remaining >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_write(bs, sector_num, src, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        src += len;
        remaining -= len;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(tmp_buf, src, remaining);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }

    return count1;
}
1789

    
1790
/*
1791
 * Writes to the file and ensures that no writes are reordered across this
1792
 * request (acts as a barrier)
1793
 *
1794
 * Returns 0 on success, -errno in error cases.
1795
 */
1796
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1797
    const void *buf, int count)
1798
{
1799
    int ret;
1800

    
1801
    ret = bdrv_pwrite(bs, offset, buf, count);
1802
    if (ret < 0) {
1803
        return ret;
1804
    }
1805

    
1806
    /* No flush needed for cache modes that already do it */
1807
    if (bs->enable_write_cache) {
1808
        bdrv_flush(bs);
1809
    }
1810

    
1811
    return 0;
1812
}
1813

    
1814
/*
 * Copy-on-read: read the whole cluster(s) around the request through the
 * driver, write that data back through the driver, then hand the requested
 * part to the caller's qiov.
 *
 * Returns the driver's result: negative on read or write failure, in which
 * case qiov is left untouched.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;
    QEMUIOVector bounce_qiov;
    struct iovec iov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    iov.iov_base = bounce_buffer;
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);

    if (ret >= 0) {
        if (drv->bdrv_co_write_zeroes &&
            buffer_is_zero(bounce_buffer, iov.iov_len)) {
            ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                          cluster_nb_sectors);
        } else {
            /* This does not change the data on the disk, it is not necessary
             * to flush even in cache=writethrough mode.
             */
            ret = drv->bdrv_co_writev(bs, cluster_sector_num,
                                      cluster_nb_sectors, &bounce_qiov);
        }
    }

    /* It might be okay to ignore write errors for guest requests.  If this
     * is a deliberate copy-on-read then we don't want to ignore the error.
     * Simply report it in all cases.
     */
    if (ret >= 0) {
        /* the caller only asked for part of the cluster-sized buffer */
        skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
        qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                            nb_sectors * BDRV_SECTOR_SIZE);
    }

    qemu_vfree(bounce_buffer);
    return ret;
}
1879

    
1880
/*
1881
 * Handle a read request in coroutine context
1882
 */
1883
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1884
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1885
    BdrvRequestFlags flags)
1886
{
1887
    BlockDriver *drv = bs->drv;
1888
    BdrvTrackedRequest req;
1889
    int ret;
1890

    
1891
    if (!drv) {
1892
        return -ENOMEDIUM;
1893
    }
1894
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1895
        return -EIO;
1896
    }
1897

    
1898
    /* throttling disk read I/O */
1899
    if (bs->io_limits_enabled) {
1900
        bdrv_io_limits_intercept(bs, false, nb_sectors);
1901
    }
1902

    
1903
    if (bs->copy_on_read) {
1904
        flags |= BDRV_REQ_COPY_ON_READ;
1905
    }
1906
    if (flags & BDRV_REQ_COPY_ON_READ) {
1907
        bs->copy_on_read_in_flight++;
1908
    }
1909

    
1910
    if (bs->copy_on_read_in_flight) {
1911
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1912
    }
1913

    
1914
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1915

    
1916
    if (flags & BDRV_REQ_COPY_ON_READ) {
1917
        int pnum;
1918

    
1919
        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1920
        if (ret < 0) {
1921
            goto out;
1922
        }
1923

    
1924
        if (!ret || pnum != nb_sectors) {
1925
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1926
            goto out;
1927
        }
1928
    }
1929

    
1930
    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1931

    
1932
out:
1933
    tracked_request_end(&req);
1934

    
1935
    if (flags & BDRV_REQ_COPY_ON_READ) {
1936
        bs->copy_on_read_in_flight--;
1937
    }
1938

    
1939
    return ret;
1940
}
1941

    
1942
/*
 * Coroutine read entry point: emits the trace event and forwards the
 * request to bdrv_co_do_readv() with no request flags set.
 */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
1949

    
1950
/*
 * Coroutine read entry point that always requests copy-on-read: emits the
 * trace event and forwards to bdrv_co_do_readv() with
 * BDRV_REQ_COPY_ON_READ set.
 */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
1958

    
1959
/*
 * Write nb_sectors of zeroes starting at sector_num.
 *
 * The driver's bdrv_co_write_zeroes callback is tried first; if it is
 * missing or returns -ENOTSUP, a zero-filled bounce buffer is written
 * through bdrv_co_writev instead.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector zero_qiov;
    struct iovec zero_iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* Fast path: let the driver zero the range itself */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Slow path: write an explicitly zeroed, block-aligned buffer */
    zero_iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    zero_iov.iov_base = qemu_blockalign(bs, zero_iov.iov_len);
    memset(zero_iov.iov_base, 0, zero_iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &zero_iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);

    qemu_vfree(zero_iov.iov_base);
    return ret;
}
1989

    
1990
/*
1991
 * Handle a write request in coroutine context
1992
 */
1993
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1994
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1995
    BdrvRequestFlags flags)
1996
{
1997
    BlockDriver *drv = bs->drv;
1998
    BdrvTrackedRequest req;
1999
    int ret;
2000

    
2001
    if (!bs->drv) {
2002
        return -ENOMEDIUM;
2003
    }
2004
    if (bs->read_only) {
2005
        return -EACCES;
2006
    }
2007
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2008
        return -EIO;
2009
    }
2010

    
2011
    /* throttling disk write I/O */
2012
    if (bs->io_limits_enabled) {
2013
        bdrv_io_limits_intercept(bs, true, nb_sectors);
2014
    }
2015

    
2016
    if (bs->copy_on_read_in_flight) {
2017
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2018
    }
2019

    
2020
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2021

    
2022
    if (flags & BDRV_REQ_ZERO_WRITE) {
2023
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2024
    } else {
2025
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2026
    }
2027

    
2028
    if (ret == 0 && !bs->enable_write_cache) {
2029
        ret = bdrv_co_flush(bs);
2030
    }
2031

    
2032
    if (bs->dirty_bitmap) {
2033
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2034
    }
2035

    
2036
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2037
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
2038
    }
2039

    
2040
    tracked_request_end(&req);
2041

    
2042
    return ret;
2043
}
2044

    
2045
/*
 * Coroutine write entry point: emits the trace event and forwards the
 * request to bdrv_co_do_writev() with no request flags set.
 */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2052

    
2053
/*
 * Coroutine entry point for zeroing a sector range: emits the trace event
 * and forwards to bdrv_co_do_writev() with BDRV_REQ_ZERO_WRITE and no
 * I/O vector.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    int ret;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                            BDRV_REQ_ZERO_WRITE);
    return ret;
}
2061

    
2062
/**
2063
 * Truncate file to 'offset' bytes (needed only for file protocols)
2064
 */
2065
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2066
{
2067
    BlockDriver *drv = bs->drv;
2068
    int ret;
2069
    if (!drv)
2070
        return -ENOMEDIUM;
2071
    if (!drv->bdrv_truncate)
2072
        return -ENOTSUP;
2073
    if (bs->read_only)
2074
        return -EACCES;
2075
    if (bdrv_in_use(bs))
2076
        return -EBUSY;
2077
    ret = drv->bdrv_truncate(bs, offset);
2078
    if (ret == 0) {
2079
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2080
        bdrv_dev_resize_cb(bs);
2081
    }
2082
    return ret;
2083
}
2084

    
2085
/**
2086
 * Length of a allocated file in bytes. Sparse files are counted by actual
2087
 * allocated space. Return < 0 if error or unknown.
2088
 */
2089
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2090
{
2091
    BlockDriver *drv = bs->drv;
2092
    if (!drv) {
2093
        return -ENOMEDIUM;
2094
    }
2095
    if (drv->bdrv_get_allocated_file_size) {
2096
        return drv->bdrv_get_allocated_file_size(bs);
2097
    }
2098
    if (bs->file) {
2099
        return bdrv_get_allocated_file_size(bs->file);
2100
    }
2101
    return -ENOTSUP;
2102
}
2103

    
2104
/**
2105
 * Length of a file in bytes. Return < 0 if error or unknown.
2106
 */
2107
int64_t bdrv_getlength(BlockDriverState *bs)
2108
{
2109
    BlockDriver *drv = bs->drv;
2110
    if (!drv)
2111
        return -ENOMEDIUM;
2112

    
2113
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2114
        if (drv->bdrv_getlength) {
2115
            return drv->bdrv_getlength(bs);
2116
        }
2117
    }
2118
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2119
}
2120

    
2121
/* return 0 as number of sectors if no device present or error */
2122
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2123
{
2124
    int64_t length;
2125
    length = bdrv_getlength(bs);
2126
    if (length < 0)
2127
        length = 0;
2128
    else
2129
        length = length >> BDRV_SECTOR_BITS;
2130
    *nb_sectors_ptr = length;
2131
}
2132

    
2133
/* throttling disk io limits */
2134
void bdrv_set_io_limits(BlockDriverState *bs,
2135
                        BlockIOLimit *io_limits)
2136
{
2137
    bs->io_limits = *io_limits;
2138
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2139
}
2140

    
2141
/* Record the actions to take on read errors and on write errors. */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
2147

    
2148
/*
 * Return the configured error action: the read action when is_read is
 * non-zero, the write action otherwise.
 */
BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
2152

    
2153
/* Return the device's read_only flag. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
2157

    
2158
/* Return the device's sg flag. */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
2162

    
2163
/* Return the device's enable_write_cache setting. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
2167

    
2168
/* Store 'wce' as the device's enable_write_cache setting. */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;
}
2172

    
2173
/*
 * Return 1 if the backing image is encrypted; otherwise return this
 * image's own 'encrypted' flag.
 */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        return 1;
    }

    return bs->encrypted;
}
2179

    
2180
/*
 * Return non-zero when either the backing image or this image is
 * encrypted and does not yet have a valid key.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted && !backing->valid_key) {
        return 1;
    }

    return (bs->encrypted && !bs->valid_key);
}
2188

    
2189
/*
 * Set the encryption key on bs and, when it is encrypted, on its backing
 * image first.
 *
 * Returns -EINVAL when bs itself is not encrypted (and no encrypted
 * backing image consumed the key), -ENOMEDIUM when there is no driver or
 * the driver cannot set keys, otherwise the driver's result.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    BlockDriverState *backing = bs->backing_hd;
    int ret;

    if (backing && backing->encrypted) {
        ret = bdrv_set_key(backing, key);
        if (ret < 0) {
            return ret;
        }
        /* Only the backing image needed the key */
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2214

    
2215
/* Return the attached driver's format name, or NULL without a driver. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    if (!bs->drv) {
        return NULL;
    }
    return bs->drv->format_name;
}
2219

    
2220
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2221
                         void *opaque)
2222
{
2223
    BlockDriver *drv;
2224

    
2225
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
2226
        it(opaque, drv->format_name);
2227
    }
2228
}
2229

    
2230
BlockDriverState *bdrv_find(const char *name)
2231
{
2232
    BlockDriverState *bs;
2233

    
2234
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2235
        if (!strcmp(name, bs->device_name)) {
2236
            return bs;
2237
        }
2238
    }
2239
    return NULL;
2240
}
2241

    
2242
/*
 * Step through bdrv_states: a NULL argument yields the first entry,
 * otherwise the entry following 'bs' is returned.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
2249

    
2250
/* Invoke 'it' with 'opaque' for every entry of the bdrv_states list. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
2258

    
2259
/* Return the device_name stored in bs. */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
2263

    
2264
/* Return the open_flags stored in bs. */
int bdrv_get_flags(BlockDriverState *bs)
{
    int open_flags = bs->open_flags;

    return open_flags;
}
2268

    
2269
void bdrv_flush_all(void)
2270
{
2271
    BlockDriverState *bs;
2272

    
2273
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2274
        bdrv_flush(bs);
2275
    }
2276
}
2277

    
2278
/*
 * Return the driver's bdrv_has_zero_init() answer, or 1 when the driver
 * does not implement that callback.  A driver must be attached.
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (!bs->drv->bdrv_has_zero_init) {
        return 1;
    }

    return bs->drv->bdrv_has_zero_init(bs);
}
2288

    
2289
typedef struct BdrvCoIsAllocatedData {
2290
    BlockDriverState *bs;
2291
    int64_t sector_num;
2292
    int nb_sectors;
2293
    int *pnum;
2294
    int ret;
2295
    bool done;
2296
} BdrvCoIsAllocatedData;
2297

    
2298
/*
2299
 * Returns true iff the specified sector is present in the disk image. Drivers
2300
 * not implementing the functionality are assumed to not support backing files,
2301
 * hence all their sectors are reported as allocated.
2302
 *
2303
 * If 'sector_num' is beyond the end of the disk image the return value is 0
2304
 * and 'pnum' is set to 0.
2305
 *
2306
 * 'pnum' is set to the number of sectors (including and immediately following
2307
 * the specified sector) that are known to be in the same
2308
 * allocated/unallocated state.
2309
 *
2310
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2311
 * beyond the end of the disk image it will be clamped.
2312
 */
2313
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2314
                                      int nb_sectors, int *pnum)
2315
{
2316
    int64_t n;
2317

    
2318
    if (sector_num >= bs->total_sectors) {
2319
        *pnum = 0;
2320
        return 0;
2321
    }
2322

    
2323
    n = bs->total_sectors - sector_num;
2324
    if (n < nb_sectors) {
2325
        nb_sectors = n;
2326
    }
2327

    
2328
    if (!bs->drv->bdrv_co_is_allocated) {
2329
        *pnum = nb_sectors;
2330
        return 1;
2331
    }
2332

    
2333
    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2334
}
2335

    
2336
/* Coroutine wrapper for bdrv_is_allocated() */
2337
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2338
{
2339
    BdrvCoIsAllocatedData *data = opaque;
2340
    BlockDriverState *bs = data->bs;
2341

    
2342
    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2343
                                     data->pnum);
2344
    data->done = true;
2345
}
2346

    
2347
/*
2348
 * Synchronous wrapper around bdrv_co_is_allocated().
2349
 *
2350
 * See bdrv_co_is_allocated() for details.
2351
 */
2352
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2353
                      int *pnum)
2354
{
2355
    Coroutine *co;
2356
    BdrvCoIsAllocatedData data = {
2357
        .bs = bs,
2358
        .sector_num = sector_num,
2359
        .nb_sectors = nb_sectors,
2360
        .pnum = pnum,
2361
        .done = false,
2362
    };
2363

    
2364
    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2365
    qemu_coroutine_enter(co, &data);
2366
    while (!data.done) {
2367
        qemu_aio_wait();
2368
    }
2369
    return data.ret;
2370
}
2371

    
2372
/*
2373
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2374
 *
2375
 * Return true if the given sector is allocated in any image between
2376
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
2377
 * sector is allocated in any image of the chain.  Return false otherwise.
2378
 *
2379
 * 'pnum' is set to the number of sectors (including and immediately following
2380
 *  the specified sector) that are known to be in the same
2381
 *  allocated/unallocated state.
2382
 *
2383
 */
2384
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2385
                                            BlockDriverState *base,
2386
                                            int64_t sector_num,
2387
                                            int nb_sectors, int *pnum)
2388
{
2389
    BlockDriverState *intermediate;
2390
    int ret, n = nb_sectors;
2391

    
2392
    intermediate = top;
2393
    while (intermediate && intermediate != base) {
2394
        int pnum_inter;
2395
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2396
                                   &pnum_inter);
2397
        if (ret < 0) {
2398
            return ret;
2399
        } else if (ret) {
2400
            *pnum = pnum_inter;
2401
            return 1;
2402
        }
2403

    
2404
        /*
2405
         * [sector_num, nb_sectors] is unallocated on top but intermediate
2406
         * might have
2407
         *
2408
         * [sector_num+x, nr_sectors] allocated.
2409
         */
2410
        if (n > pnum_inter) {
2411
            n = pnum_inter;
2412
        }
2413

    
2414
        intermediate = intermediate->backing_hd;
2415
    }
2416

    
2417
    *pnum = n;
2418
    return 0;
2419
}
2420

    
2421
/*
 * Build the list of BlockInfo entries, one per entry of bdrv_states and in
 * the same order.  The 'inserted' section is filled in only for devices
 * with a driver attached; 'errp' is not written by this function.
 */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL;
    BlockInfoList **tail = &head;   /* where the next entry gets linked */
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        /* Tray state is only reported for removable media */
        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            info->value->inserted->encryption_key_missing =
                bdrv_key_required(bs);

            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file =
                    g_strdup(bs->backing_file);
            }

            info->value->inserted->backing_file_depth =
                bdrv_get_backing_file_depth(bs);

            /* Limits stay at their zero-initialised value when disabled */
            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        *tail = info;
        tail = &info->next;
    }

    return head;
}
2488

    
2489
/* Consider exposing this as a full fledged QMP command */
2490
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2491
{
2492
    BlockStats *s;
2493

    
2494
    s = g_malloc0(sizeof(*s));
2495

    
2496
    if (bs->device_name[0]) {
2497
        s->has_device = true;
2498
        s->device = g_strdup(bs->device_name);
2499
    }
2500

    
2501
    s->stats = g_malloc0(sizeof(*s->stats));
2502
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2503
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2504
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2505
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2506
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2507
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2508
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2509
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2510
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2511

    
2512
    if (bs->file) {
2513
        s->has_parent = true;
2514
        s->parent = qmp_query_blockstat(bs->file, NULL);
2515
    }
2516

    
2517
    return s;
2518
}
2519

    
2520
/*
 * Build the list of BlockStats entries, one per entry of bdrv_states and
 * in the same order.  'errp' is not written by this function.
 */
BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL;
    BlockStatsList **tail = &head;  /* where the next entry gets linked */
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *entry = g_malloc0(sizeof(*entry));

        entry->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        *tail = entry;
        tail = &entry->next;
    }

    return head;
}
2540

    
2541
/*
 * Return the backing file name when the backing image is encrypted, else
 * this image's file name when it is encrypted itself, else NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }

    if (bs->encrypted) {
        return bs->filename;
    }

    return NULL;
}
2550

    
2551
/*
 * Copy the backing file name of 'bs' into 'filename' using pstrcpy(),
 * with 'filename_size' passed as the destination size.
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing = bs->backing_file;

    pstrcpy(filename, filename_size, backing);
}
2556

    
2557
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2558
                          const uint8_t *buf, int nb_sectors)
2559
{
2560
    BlockDriver *drv = bs->drv;
2561
    if (!drv)
2562
        return -ENOMEDIUM;
2563
    if (!drv->bdrv_write_compressed)
2564
        return -ENOTSUP;
2565
    if (bdrv_check_request(bs, sector_num, nb_sectors))
2566
        return -EIO;
2567

    
2568
    if (bs->dirty_bitmap) {
2569
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2570
    }
2571

    
2572
    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2573
}
2574

    
2575
/*
 * Fill *bdi through the driver's bdrv_get_info callback; the structure is
 * zeroed before the driver sees it.
 *
 * Returns -ENOMEDIUM without a driver and -ENOTSUP when the driver lacks
 * the callback (in both cases *bdi is left untouched).
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
2585

    
2586
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2587
                      int64_t pos, int size)
2588
{
2589
    BlockDriver *drv = bs->drv;
2590
    if (!drv)
2591
        return -ENOMEDIUM;
2592
    if (drv->bdrv_save_vmstate)
2593
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
2594
    if (bs->file)
2595
        return bdrv_save_vmstate(bs->file, buf, pos, size);
2596
    return -ENOTSUP;
2597
}
2598

    
2599
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2600
                      int64_t pos, int size)
2601
{
2602
    BlockDriver *drv = bs->drv;
2603
    if (!drv)
2604
        return -ENOMEDIUM;
2605
    if (drv->bdrv_load_vmstate)
2606
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
2607
    if (bs->file)
2608
        return bdrv_load_vmstate(bs->file, buf, pos, size);
2609
    return -ENOTSUP;
2610
}
2611

    
2612
/*
 * Forward 'event' to the driver's bdrv_debug_event callback.  Does nothing
 * when there is no driver or the driver lacks the callback.
 */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_debug_event) {
        drv->bdrv_debug_event(bs, event);
    }
}
2623

    
2624
/**************************************************************/
2625
/* handling of snapshots */
2626

    
2627
/*
 * Return 1 if snapshots can be taken on 'bs', 0 otherwise.
 *
 * A device without a driver, without inserted media or opened read-only
 * cannot snapshot.  If the driver has no bdrv_snapshot_create callback the
 * answer is taken from bs->file, when there is one.
 */
int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (drv->bdrv_snapshot_create) {
        return 1;
    }

    if (bs->file != NULL) {
        return bdrv_can_snapshot(bs->file);
    }

    return 0;
}
2643

    
2644
/* Return 1 if BDRV_O_SNAPSHOT is set in the device's open flags, else 0. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
2648

    
2649
BlockDriverState *bdrv_snapshots(void)
2650
{
2651
    BlockDriverState *bs;
2652

    
2653
    if (bs_snapshots) {
2654
        return bs_snapshots;
2655
    }
2656

    
2657
    bs = NULL;
2658
    while ((bs = bdrv_next(bs))) {
2659
        if (bdrv_can_snapshot(bs)) {
2660
            bs_snapshots = bs;
2661
            return bs;
2662
        }
2663
    }
2664
    return NULL;
2665
}
2666

    
2667
int bdrv_snapshot_create(BlockDriverState *bs,
2668
                         QEMUSnapshotInfo *sn_info)
2669
{
2670
    BlockDriver *drv = bs->drv;
2671
    if (!drv)
2672
        return -ENOMEDIUM;
2673
    if (drv->bdrv_snapshot_create)
2674
        return drv->bdrv_snapshot_create(bs, sn_info);
2675
    if (bs->file)
2676
        return bdrv_snapshot_create(bs->file, sn_info);
2677
    return -ENOTSUP;
2678
}
2679

    
2680
/*
 * Revert 'bs' to the snapshot 'snapshot_id'.
 *
 * The driver's bdrv_snapshot_goto callback is used when present.
 * Otherwise, if there is an underlying file, the driver is closed, the
 * snapshot is applied to bs->file and the driver is opened again; if that
 * reopen fails, bs->file is deleted, bs->drv is cleared and the open error
 * is returned.
 *
 * Returns -ENOMEDIUM without a driver and -ENOTSUP when neither path is
 * available.
 */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int goto_ret, reopen_ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_snapshot_goto) {
        return drv->bdrv_snapshot_goto(bs, snapshot_id);
    }
    if (!bs->file) {
        return -ENOTSUP;
    }

    /* Close, switch the underlying file to the snapshot, then reopen */
    drv->bdrv_close(bs);
    goto_ret = bdrv_snapshot_goto(bs->file, snapshot_id);
    reopen_ret = drv->bdrv_open(bs, bs->open_flags);
    if (reopen_ret < 0) {
        bdrv_delete(bs->file);
        bs->drv = NULL;
        return reopen_ret;
    }

    return goto_ret;
}
2705

    
2706
/*
 * Delete a snapshot through the driver's bdrv_snapshot_delete callback,
 * falling back to bs->file when the driver has none.
 *
 * Returns -ENOMEDIUM without a driver and -ENOTSUP when neither the
 * driver nor an underlying file can handle the request.
 */
int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_snapshot_delete) {
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    }
    if (bs->file) {
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    }

    return -ENOTSUP;
}
2717

    
2718
int bdrv_snapshot_list(BlockDriverState *bs,
2719
                       QEMUSnapshotInfo **psn_info)
2720
{
2721
    BlockDriver *drv = bs->drv;
2722
    if (!drv)
2723
        return -ENOMEDIUM;
2724
    if (drv->bdrv_snapshot_list)
2725
        return drv->bdrv_snapshot_list(bs, psn_info);
2726
    if (bs->file)
2727
        return bdrv_snapshot_list(bs->file, psn_info);
2728
    return -ENOTSUP;
2729
}
2730

    
2731
/*
 * Call the driver's bdrv_snapshot_load_tmp callback for 'snapshot_name'.
 *
 * Returns -ENOMEDIUM without a driver, -EINVAL unless the device is
 * read-only, and -ENOTSUP when the driver lacks the callback.
 */
int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (!drv->bdrv_snapshot_load_tmp) {
        return -ENOTSUP;
    }

    return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
}
2746

    
2747
/*
 * Walk down the backing chain of 'bs' and return the first backing image
 * whose recorded name (the backing_file field of the image above it)
 * equals 'backing_file'.
 *
 * Returns NULL when the walk reaches an image without a driver or without
 * a backing image before finding a match.
 */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    BlockDriverState *cur;

    for (cur = bs; cur->drv && cur->backing_hd; cur = cur->backing_hd) {
        if (strcmp(cur->backing_file, backing_file) == 0) {
            return cur->backing_hd;
        }
    }

    return NULL;
}
2764

    
2765
/*
 * Count the backing images below 'bs'.  Counting stops at the first image
 * that has no driver or no backing image, so such an image contributes 0.
 */
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    BlockDriverState *cur;
    int depth = 0;

    for (cur = bs; cur->drv && cur->backing_hd; cur = cur->backing_hd) {
        depth++;
    }

    return depth;
}
2777

    
2778
#define NB_SUFFIXES 4

/*
 * Format 'size' into 'buf' (at most 'buf_size' bytes, via snprintf) in a
 * compact human readable form and return 'buf'.
 *
 * Values up to 999 are printed as plain integers.  Larger values are
 * scaled by powers of 1024 and suffixed with K, M, G or T: one decimal
 * place while the scaled value is below 10, a rounded integer otherwise.
 * Anything too large for the other suffixes is expressed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        int is_last = (idx == NB_SUFFIXES - 1);

        if (size < 10 * unit) {
            /* Small multiple of the unit: keep one fractional digit */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit,
                     suffixes[idx]);
            return buf;
        }

        if (size < 1000 * unit || is_last) {
            /* Round to the nearest whole unit */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit,
                     suffixes[idx]);
            return buf;
        }
    }

    return buf;
}
2807

    
2808
/*
 * Format one line of the snapshot table into 'buf' and return 'buf'.
 *
 * With sn == NULL the column header line is produced; otherwise the line
 * holds the snapshot's id, name, VM state size (human readable), creation
 * date in local time and VM clock as hh:mm:ss.mmm.
 */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char size_buf[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
        return buf;
    }

    /* Creation date, rendered in local time */
    ti = sn->date_sec;
#ifdef _WIN32
    ptm = localtime(&ti);
    strftime(date_buf, sizeof(date_buf),
             "%Y-%m-%d %H:%M:%S", ptm);
#else
    localtime_r(&ti, &tm);
    strftime(date_buf, sizeof(date_buf),
             "%Y-%m-%d %H:%M:%S", &tm);
#endif

    /* VM clock: whole seconds split into h/m/s, plus milliseconds */
    secs = sn->vm_clock_nsec / 1000000000;
    snprintf(clock_buf, sizeof(clock_buf),
             "%02d:%02d:%02d.%03d",
             (int)(secs / 3600),
             (int)((secs / 60) % 60),
             (int)(secs % 60),
             (int)((sn->vm_clock_nsec / 1000000) % 1000));

    get_human_readable_size(size_buf, sizeof(size_buf), sn->vm_state_size);

    snprintf(buf, buf_size,
             "%-10s%-20s%7s%20s%15s",
             sn->id_str, sn->name,
             size_buf,
             date_buf,
             clock_buf);
    return buf;
}
2850

    
2851
/**************************************************************/
2852
/* async I/Os */
2853

    
2854
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2855
                                 QEMUIOVector *qiov, int nb_sectors,
2856
                                 BlockDriverCompletionFunc *cb, void *opaque)
2857
{
2858
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2859

    
2860
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2861
                                 cb, opaque, false);
2862
}
2863

    
2864
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2865
                                  QEMUIOVector *qiov, int nb_sectors,
2866
                                  BlockDriverCompletionFunc *cb, void *opaque)
2867
{
2868
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2869

    
2870
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2871
                                 cb, opaque, true);
2872
}
2873

    
2874

    
2875
typedef struct MultiwriteCB {
2876
    int error;
2877
    int num_requests;
2878
    int num_callbacks;
2879
    struct {
2880
        BlockDriverCompletionFunc *cb;
2881
        void *opaque;
2882
        QEMUIOVector *free_qiov;
2883
    } callbacks[];
2884
} MultiwriteCB;
2885

    
2886
/*
 * Invoke every caller-supplied completion callback recorded in mcb with the
 * accumulated error code, and release any merged qiov attached to the entry.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx = 0;

    while (idx < mcb->num_callbacks) {
        QEMUIOVector *merged;

        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        /* Merged vectors were allocated by multiwrite_merge(). */
        merged = mcb->callbacks[idx].free_qiov;
        if (merged != NULL) {
            qemu_iovec_destroy(merged);
        }
        g_free(merged);

        idx++;
    }
}
2898

    
2899
static void multiwrite_cb(void *opaque, int ret)
2900
{
2901
    MultiwriteCB *mcb = opaque;
2902

    
2903
    trace_multiwrite_cb(mcb, ret);
2904

    
2905
    if (ret < 0 && !mcb->error) {
2906
        mcb->error = ret;
2907
    }
2908

    
2909
    mcb->num_requests--;
2910
    if (mcb->num_requests == 0) {
2911
        multiwrite_user_cb(mcb);
2912
        g_free(mcb);
2913
    }
2914
}
2915

    
2916
static int multiwrite_req_compare(const void *a, const void *b)
2917
{
2918
    const BlockRequest *req1 = a, *req2 = b;
2919

    
2920
    /*
2921
     * Note that we can't simply subtract req2->sector from req1->sector
2922
     * here as that could overflow the return value.
2923
     */
2924
    if (req1->sector > req2->sector) {
2925
        return 1;
2926
    } else if (req1->sector < req2->sector) {
2927
        return -1;
2928
    } else {
2929
        return 0;
2930
    }
2931
}
2932

    
2933
/*
2934
 * Takes a bunch of requests and tries to merge them. Returns the number of
2935
 * requests that remain after merging.
2936
 */
2937
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2938
    int num_reqs, MultiwriteCB *mcb)
2939
{
2940
    int i, outidx;
2941

    
2942
    // Sort requests by start sector
2943
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2944

    
2945
    // Check if adjacent requests touch the same clusters. If so, combine them,
2946
    // filling up gaps with zero sectors.
2947
    outidx = 0;
2948
    for (i = 1; i < num_reqs; i++) {
2949
        int merge = 0;
2950
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2951

    
2952
        // Handle exactly sequential writes and overlapping writes.
2953
        if (reqs[i].sector <= oldreq_last) {
2954
            merge = 1;
2955
        }
2956

    
2957
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2958
            merge = 0;
2959
        }
2960

    
2961
        if (merge) {
2962
            size_t size;
2963
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2964
            qemu_iovec_init(qiov,
2965
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2966

    
2967
            // Add the first request to the merged one. If the requests are
2968
            // overlapping, drop the last sectors of the first request.
2969
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
2970
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
2971

    
2972
            // We should need to add any zeros between the two requests
2973
            assert (reqs[i].sector <= oldreq_last);
2974

    
2975
            // Add the second request
2976
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
2977

    
2978
            reqs[outidx].nb_sectors = qiov->size >> 9;
2979
            reqs[outidx].qiov = qiov;
2980

    
2981
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2982
        } else {
2983
            outidx++;
2984
            reqs[outidx].sector     = reqs[i].sector;
2985
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2986
            reqs[outidx].qiov       = reqs[i].qiov;
2987
        }
2988
    }
2989

    
2990
    return outidx + 1;
2991
}
2992

    
2993
/*
2994
 * Submit multiple AIO write requests at once.
2995
 *
2996
 * On success, the function returns 0 and all requests in the reqs array have
2997
 * been submitted. In error case this function returns -1, and any of the
2998
 * requests may or may not be submitted yet. In particular, this means that the
2999
 * callback will be called for some of the requests, for others it won't. The
3000
 * caller must check the error field of the BlockRequest to wait for the right
3001
 * callbacks (if error != 0, no callback will be called).
3002
 *
3003
 * The implementation may modify the contents of the reqs array, e.g. to merge
3004
 * requests. However, the fields opaque and error are left unmodified as they
3005
 * are used to signal failure for a single request to the caller.
3006
 */
3007
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3008
{
3009
    MultiwriteCB *mcb;
3010
    int i;
3011

    
3012
    /* don't submit writes if we don't have a medium */
3013
    if (bs->drv == NULL) {
3014
        for (i = 0; i < num_reqs; i++) {
3015
            reqs[i].error = -ENOMEDIUM;
3016
        }
3017
        return -1;
3018
    }
3019

    
3020
    if (num_reqs == 0) {
3021
        return 0;
3022
    }
3023

    
3024
    // Create MultiwriteCB structure
3025
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3026
    mcb->num_requests = 0;
3027
    mcb->num_callbacks = num_reqs;
3028

    
3029
    for (i = 0; i < num_reqs; i++) {
3030
        mcb->callbacks[i].cb = reqs[i].cb;
3031
        mcb->callbacks[i].opaque = reqs[i].opaque;
3032
    }
3033

    
3034
    // Check for mergable requests
3035
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3036

    
3037
    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3038

    
3039
    /* Run the aio requests. */
3040
    mcb->num_requests = num_reqs;
3041
    for (i = 0; i < num_reqs; i++) {
3042
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3043
            reqs[i].nb_sectors, multiwrite_cb, mcb);
3044
    }
3045

    
3046
    return 0;
3047
}
3048

    
3049
/* Cancel acb by calling the cancel hook of the AIOPool it belongs to. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    AIOPool *pool = acb->pool;

    pool->cancel(acb);
}
3053

    
3054
/* block I/O throttling */
3055
/*
 * Check whether dispatching nb_sectors more sectors would exceed the
 * bytes-per-second limit that applies to this request (the total limit if
 * set, otherwise the per-direction limit selected by is_write).
 *
 * Returns false, storing 0 in *wait, when no limit applies or the request
 * fits into the current slice. Otherwise updates bs->slice_time and
 * bs->slice_end, stores the computed delay in *wait and returns true.
 * wait may be NULL.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   budget, done, pending;
    double   slice_secs, delay;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        /* No byte-rate limit configured for this direction. */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    slice_secs = bs->slice_end - bs->slice_start;
    slice_secs /= (NANOSECONDS_PER_SECOND);
    budget = bps_limit * slice_secs;

    /*
     * done: bytes already read/written since the slice began, taken from
     * the accumulated statistics. With a total limit both directions count.
     */
    done = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        done += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /*
     * pending: bytes this request still needs to transfer.
     * (done + pending) / bps_limit is used to calculate the total time
     * needed to finish reading/writing all of that data.
     */
    pending = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (done + pending <= budget) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Calc approx time to dispatch */
    delay = (done + pending) / bps_limit - elapsed_time;

    /*
     * When the I/O rate at runtime exceeds the limits, bs->slice_end needs
     * to be extended so that the current statistics are kept until the
     * timer fires; the factors were tuned experimentally.
     */
    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3114

    
3115
/*
 * Check whether dispatching one more request would exceed the
 * operations-per-second limit that applies to it (the total limit if set,
 * otherwise the per-direction limit selected by is_write).
 *
 * Returns false, storing 0 in *wait, when no limit applies or the request
 * fits into the current slice. Otherwise updates bs->slice_time and
 * bs->slice_end, stores the computed delay in *wait and returns true.
 * wait may be NULL.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   budget, done;
    double   slice_secs, delay;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        /* No operation-rate limit configured for this direction. */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    slice_secs = bs->slice_end - bs->slice_start;
    slice_secs /= (NANOSECONDS_PER_SECOND);
    budget = iops_limit * slice_secs;

    /* Operations already issued in this slice; both directions if total. */
    done = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        done += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (done + 1 <= budget) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Calc approx time to dispatch; never negative */
    delay = (done + 1) / iops_limit;
    if (delay > elapsed_time) {
        delay = delay - elapsed_time;
    } else {
        delay = 0;
    }

    bs->slice_time = delay * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = delay * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3166

    
3167
/*
 * Decide whether a request of nb_sectors sectors must be delayed because of
 * the configured bps/iops limits.
 *
 * If the current vm_clock time lies strictly inside the active slice, the
 * slice is extended; otherwise a new slice is started and the io_base
 * counters are reset to the current statistics. Returns true and stores the
 * larger of the two computed delays in *wait when either limit is exceeded;
 * returns false with *wait = 0 otherwise. wait may be NULL.
 */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start >= now) || (bs->slice_end <= now)) {
        /* Outside the active slice: begin a fresh one from now. */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    } else {
        /* Still inside the slice: push its end out. */
        bs->slice_end = now + bs->slice_time;
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (!(bps_ret || iops_ret)) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
    if (wait) {
        *wait = max_wait;
    }

    /* Make sure the slice lasts at least until the wait has elapsed. */
    now = qemu_get_clock_ns(vm_clock);
    if (bs->slice_end < now + max_wait) {
        bs->slice_end = now + max_wait;
    }

    return true;
}
3218

    
3219
/**************************************************************/
3220
/* async block device emulation */
3221

    
3222
typedef struct BlockDriverAIOCBSync {
3223
    BlockDriverAIOCB common;
3224
    QEMUBH *bh;
3225
    int ret;
3226
    /* vector translation state */
3227
    QEMUIOVector *qiov;
3228
    uint8_t *bounce;
3229
    int is_write;
3230
} BlockDriverAIOCBSync;
3231

    
3232
/*
 * Cancel hook of bdrv_em_aio_pool: delete the pending bottom half of the
 * enclosing BlockDriverAIOCBSync and return the AIOCB to its pool.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *sync_acb;

    sync_acb = container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(sync_acb->bh);
    sync_acb->bh = NULL;
    qemu_aio_release(sync_acb);
}
3240

    
3241
static AIOPool bdrv_em_aio_pool = {
3242
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3243
    .cancel             = bdrv_aio_cancel_em,
3244
};
3245

    
3246
static void bdrv_aio_bh_cb(void *opaque)
3247
{
3248
    BlockDriverAIOCBSync *acb = opaque;
3249

    
3250
    if (!acb->is_write)
3251
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3252
    qemu_vfree(acb->bounce);
3253
    acb->common.cb(acb->common.opaque, acb->ret);
3254
    qemu_bh_delete(acb->bh);
3255
    acb->bh = NULL;
3256
    qemu_aio_release(acb);
3257
}
3258

    
3259
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3260
                                            int64_t sector_num,
3261
                                            QEMUIOVector *qiov,
3262
                                            int nb_sectors,
3263
                                            BlockDriverCompletionFunc *cb,
3264
                                            void *opaque,
3265
                                            int is_write)
3266

    
3267
{
3268
    BlockDriverAIOCBSync *acb;
3269

    
3270
    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3271
    acb->is_write = is_write;
3272
    acb->qiov = qiov;
3273
    acb->bounce = qemu_blockalign(bs, qiov->size);
3274
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3275

    
3276
    if (is_write) {
3277
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3278
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3279
    } else {
3280
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3281
    }
3282

    
3283
    qemu_bh_schedule(acb->bh);
3284

    
3285
    return &acb->common;
3286
}
3287

    
3288
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3289
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3290
        BlockDriverCompletionFunc *cb, void *opaque)
3291
{
3292
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3293
}
3294

    
3295
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3296
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3297
        BlockDriverCompletionFunc *cb, void *opaque)
3298
{
3299
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3300
}
3301

    
3302

    
3303
typedef struct BlockDriverAIOCBCoroutine {
3304
    BlockDriverAIOCB common;
3305
    BlockRequest req;
3306
    bool is_write;
3307
    QEMUBH* bh;
3308
} BlockDriverAIOCBCoroutine;
3309

    
3310
/*
 * Cancel hook of bdrv_em_co_aio_pool. blockacb itself is not inspected;
 * the hook simply calls qemu_aio_flush().
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
3314

    
3315
static AIOPool bdrv_em_co_aio_pool = {
3316
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3317
    .cancel             = bdrv_aio_co_cancel_em,
3318
};
3319

    
3320
static void bdrv_co_em_bh(void *opaque)
3321
{
3322
    BlockDriverAIOCBCoroutine *acb = opaque;
3323

    
3324
    acb->common.cb(acb->common.opaque, acb->req.error);
3325
    qemu_bh_delete(acb->bh);
3326
    qemu_aio_release(acb);
3327
}
3328

    
3329
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3330
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3331
{
3332
    BlockDriverAIOCBCoroutine *acb = opaque;
3333
    BlockDriverState *bs = acb->common.bs;
3334

    
3335
    if (!acb->is_write) {
3336
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3337
            acb->req.nb_sectors, acb->req.qiov, 0);
3338
    } else {
3339
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3340
            acb->req.nb_sectors, acb->req.qiov, 0);
3341
    }
3342

    
3343
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3344
    qemu_bh_schedule(acb->bh);
3345
}
3346

    
3347
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3348
                                               int64_t sector_num,
3349
                                               QEMUIOVector *qiov,
3350
                                               int nb_sectors,
3351
                                               BlockDriverCompletionFunc *cb,
3352
                                               void *opaque,
3353
                                               bool is_write)
3354
{
3355
    Coroutine *co;
3356
    BlockDriverAIOCBCoroutine *acb;
3357

    
3358
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3359
    acb->req.sector = sector_num;
3360
    acb->req.nb_sectors = nb_sectors;
3361
    acb->req.qiov = qiov;
3362
    acb->is_write = is_write;
3363

    
3364
    co = qemu_coroutine_create(bdrv_co_do_rw);
3365
    qemu_coroutine_enter(co, acb);
3366

    
3367
    return &acb->common;
3368
}
3369

    
3370
/*
 * Coroutine entry point for bdrv_aio_flush(): run bdrv_co_flush(), store
 * its result in req.error and schedule bdrv_co_em_bh to report completion.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *co_acb = opaque;

    co_acb->req.error = bdrv_co_flush(co_acb->common.bs);

    co_acb->bh = qemu_bh_new(bdrv_co_em_bh, co_acb);
    qemu_bh_schedule(co_acb->bh);
}
3379

    
3380
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3381
        BlockDriverCompletionFunc *cb, void *opaque)
3382
{
3383
    trace_bdrv_aio_flush(bs, opaque);
3384

    
3385
    Coroutine *co;
3386
    BlockDriverAIOCBCoroutine *acb;
3387

    
3388
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3389
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3390
    qemu_coroutine_enter(co, acb);
3391

    
3392
    return &acb->common;
3393
}
3394

    
3395
/*
 * Coroutine entry point for bdrv_aio_discard(): run bdrv_co_discard() on
 * the stored sector range, save its result in req.error and schedule
 * bdrv_co_em_bh to report completion.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *co_acb = opaque;

    co_acb->req.error = bdrv_co_discard(co_acb->common.bs,
                                        co_acb->req.sector,
                                        co_acb->req.nb_sectors);

    co_acb->bh = qemu_bh_new(bdrv_co_em_bh, co_acb);
    qemu_bh_schedule(co_acb->bh);
}
3404

    
3405
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3406
        int64_t sector_num, int nb_sectors,
3407
        BlockDriverCompletionFunc *cb, void *opaque)
3408
{
3409
    Coroutine *co;
3410
    BlockDriverAIOCBCoroutine *acb;
3411

    
3412
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3413

    
3414
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3415
    acb->req.sector = sector_num;
3416
    acb->req.nb_sectors = nb_sectors;
3417
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3418
    qemu_coroutine_enter(co, acb);
3419

    
3420
    return &acb->common;
3421
}
3422

    
3423
void bdrv_init(void)
3424
{
3425
    module_call_init(MODULE_INIT_BLOCK);
3426
}
3427

    
3428
void bdrv_init_with_whitelist(void)
3429
{
3430
    use_bdrv_whitelist = 1;
3431
    bdrv_init();
3432
}
3433

    
3434
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3435
                   BlockDriverCompletionFunc *cb, void *opaque)
3436
{
3437
    BlockDriverAIOCB *acb;
3438

    
3439
    if (pool->free_aiocb) {
3440
        acb = pool->free_aiocb;
3441
        pool->free_aiocb = acb->next;
3442
    } else {
3443
        acb = g_malloc0(pool->aiocb_size);
3444
        acb->pool = pool;
3445
    }
3446
    acb->bs = bs;
3447
    acb->cb = cb;
3448
    acb->opaque = opaque;
3449
    return acb;
3450
}
3451

    
3452
void qemu_aio_release(void *p)
3453
{
3454
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3455
    AIOPool *pool = acb->pool;
3456
    acb->next = pool->free_aiocb;
3457
    pool->free_aiocb = acb;
3458
}
3459

    
3460
/**************************************************************/
3461
/* Coroutine block device emulation */
3462

    
3463
typedef struct CoroutineIOCompletion {
3464
    Coroutine *coroutine;
3465
    int ret;
3466
} CoroutineIOCompletion;
3467

    
3468
static void bdrv_co_io_em_complete(void *opaque, int ret)
3469
{
3470
    CoroutineIOCompletion *co = opaque;
3471

    
3472
    co->ret = ret;
3473
    qemu_coroutine_enter(co->coroutine, NULL);
3474
}
3475

    
3476
/*
 * Perform a vectored read or write from coroutine context using the
 * driver's bdrv_aio_readv/bdrv_aio_writev. The coroutine yields until
 * bdrv_co_io_em_complete() re-enters it. Returns the completion result, or
 * -EIO if the driver did not return an AIOCB.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &completion);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &completion);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    /* Resumed by bdrv_co_io_em_complete(). */
    qemu_coroutine_yield();

    return completion.ret;
}
3501

    
3502
/* Coroutine read built on the driver's AIO interface (is_write = false). */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
3508

    
3509
/* Coroutine write built on the driver's AIO interface (is_write = true). */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
3515

    
3516
/* Coroutine entry point for bdrv_flush(): stores the result in rwco->ret. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;
    int result = bdrv_co_flush(rwco->bs);

    rwco->ret = result;
}
3522

    
3523
/*
 * Flush bs from coroutine context.
 *
 * Returns 0 immediately when bs is NULL, has no medium inserted or is
 * read-only. Otherwise cached data is first written back to the OS (if the
 * driver implements bdrv_co_flush_to_os), then, unless BDRV_O_NO_FLUSH is
 * set, forced to disk through bdrv_co_flush_to_disk or bdrv_aio_flush, and
 * finally bs->file is flushed recursively. The first negative result is
 * returned as is.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
        if (bs->drv->bdrv_co_flush_to_disk) {
            ret = bs->drv->bdrv_co_flush_to_disk(bs);
        } else if (bs->drv->bdrv_aio_flush) {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion completion = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
                                          &completion);
            if (acb != NULL) {
                /* Resumed by bdrv_co_io_em_complete(). */
                qemu_coroutine_yield();
                ret = completion.ret;
            } else {
                ret = -EIO;
            }
        } else {
            /*
             * Some block drivers always operate in either writethrough or
             * unsafe mode and therefore don't support bdrv_flush. Usually
             * qemu doesn't know how the server works (because the behaviour
             * is hardcoded or depends on server-side configuration), so we
             * can't ensure that everything is safe on disk. Returning an
             * error doesn't work because that would break guests even if
             * the server operates in writethrough mode.
             *
             * Let's hope the user knows what he's doing.
             */
            ret = 0;
        }

        if (ret < 0) {
            return ret;
        }
    }

    /*
     * Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
3583

    
3584
/* Call the driver's bdrv_invalidate_cache hook, if bs has a driver with one. */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (!bs->drv || !bs->drv->bdrv_invalidate_cache) {
        return;
    }

    bs->drv->bdrv_invalidate_cache(bs);
}
3590

    
3591
void bdrv_invalidate_cache_all(void)
3592
{
3593
    BlockDriverState *bs;
3594

    
3595
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3596
        bdrv_invalidate_cache(bs);
3597
    }
3598
}
3599

    
3600
void bdrv_clear_incoming_migration_all(void)
3601
{
3602
    BlockDriverState *bs;
3603

    
3604
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3605
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3606
    }
3607
}
3608

    
3609
/*
 * Synchronous flush. When already running in a coroutine the flush entry
 * point is called directly; otherwise it runs in a new coroutine and
 * qemu_aio_wait() is called until the result is no longer NOT_DONE.
 * Returns the result of bdrv_co_flush().
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *co = qemu_coroutine_create(bdrv_flush_co_entry);

        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    }

    return rwco.ret;
}
3630

    
3631
/* Coroutine entry point for bdrv_discard(): stores the result in rwco->ret. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;
    int result = bdrv_co_discard(rwco->bs, rwco->sector_num,
                                 rwco->nb_sectors);

    rwco->ret = result;
}
3637

    
3638
/*
 * Discard nb_sectors sectors starting at sector_num from coroutine context.
 *
 * Returns -ENOMEDIUM without a driver, -EIO if bdrv_check_request() rejects
 * the range or the driver's AIO discard returns no AIOCB, and -EROFS for a
 * read-only device. Otherwise the driver's bdrv_co_discard or
 * bdrv_aio_discard result is returned; drivers implementing neither yield 0.
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
    if (bs->read_only) {
        return -EROFS;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    }

    if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion completion = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &completion);
        if (acb == NULL) {
            return -EIO;
        }

        /* Resumed by bdrv_co_io_em_complete(). */
        qemu_coroutine_yield();
        return completion.ret;
    }

    /* The driver has no discard support: nothing to do. */
    return 0;
}
3667

    
3668
/*
 * Synchronous discard. When already running in a coroutine the discard
 * entry point is called directly; otherwise it runs in a new coroutine and
 * qemu_aio_wait() is called until the result is no longer NOT_DONE.
 * Returns the result of bdrv_co_discard().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *co = qemu_coroutine_create(bdrv_discard_co_entry);

        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    } else {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    }

    return rwco.ret;
}
3691

    
3692
/**************************************************************/
3693
/* removable device support */
3694

    
3695
/**
3696
 * Return TRUE if the media is present
3697
 */
3698
int bdrv_is_inserted(BlockDriverState *bs)
3699
{
3700
    BlockDriver *drv = bs->drv;
3701

    
3702
    if (!drv)
3703
        return 0;
3704
    if (!drv->bdrv_is_inserted)
3705
        return 1;
3706
    return drv->bdrv_is_inserted(bs);
3707
}
3708

    
3709
/**
3710
 * Return whether the media changed since the last call to this
3711
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3712
 */
3713
int bdrv_media_changed(BlockDriverState *bs)
3714
{
3715
    BlockDriver *drv = bs->drv;
3716

    
3717
    if (drv && drv->bdrv_media_changed) {
3718
        return drv->bdrv_media_changed(bs);
3719
    }
3720
    return -ENOTSUP;
3721
}
3722

    
3723
/**
3724
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3725
 */
3726
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3727
{
3728
    BlockDriver *drv = bs->drv;
3729

    
3730
    if (drv && drv->bdrv_eject) {
3731
        drv->bdrv_eject(bs, eject_flag);
3732
    }
3733

    
3734
    if (bs->device_name[0] != '\0') {
3735
        bdrv_emit_qmp_eject_event(bs, eject_flag);
3736
    }
3737
}
3738

    
3739
/**
3740
 * Lock or unlock the media (if it is locked, the user won't be able
3741
 * to eject it manually).
3742
 */
3743
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3744
{
3745
    BlockDriver *drv = bs->drv;
3746

    
3747
    trace_bdrv_lock_medium(bs, locked);
3748

    
3749
    if (drv && drv->bdrv_lock_medium) {
3750
        drv->bdrv_lock_medium(bs, locked);
3751
    }
3752
}
3753

    
3754
/* needed for generic scsi interface */
3755

    
3756
/*
 * Forward an ioctl request to the driver's bdrv_ioctl hook. Returns the
 * hook's result, or -ENOTSUP if bs has no driver or the driver has no hook.
 */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }
    return drv->bdrv_ioctl(bs, req, buf);
}
3764

    
3765
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3766
        unsigned long int req, void *buf,
3767
        BlockDriverCompletionFunc *cb, void *opaque)
3768
{
3769
    BlockDriver *drv = bs->drv;
3770

    
3771
    if (drv && drv->bdrv_aio_ioctl)
3772
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3773
    return NULL;
3774
}
3775

    
3776
/* Record align as the buffer alignment used by qemu_blockalign() for bs. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
3780

    
3781
/*
 * Allocate size bytes with qemu_memalign(), aligned to bs->buffer_alignment
 * or to 512 when bs is NULL or its alignment is unset (0).
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    if (bs && bs->buffer_alignment) {
        return qemu_memalign(bs->buffer_alignment, size);
    }
    return qemu_memalign(512, size);
}
3785

    
3786
/*
 * Enable or disable dirty tracking for bs. The dirty count is reset in
 * both cases. Enabling allocates a zeroed bitmap sized from
 * bdrv_getlength() unless one already exists; disabling frees the bitmap.
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;

    if (!enable) {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
        return;
    }

    if (bs->dirty_bitmap) {
        /* Already tracking: keep the existing bitmap. */
        return;
    }

    /* Number of unsigned longs needed, rounded up. */
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

    bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
}
3806

    
3807
/*
 * Return 1 if the dirty bit of the chunk containing sector is set, 0
 * otherwise. Also returns 0 when dirty tracking is off (no bitmap) or the
 * sector lies at or beyond the device length.
 */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (!bs->dirty_bitmap ||
        (sector << BDRV_SECTOR_BITS) >= bdrv_getlength(bs)) {
        return 0;
    }

    return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
        (1UL << (chunk % (sizeof(unsigned long) * 8))));
}
3819

    
3820
/*
 * Clear the dirty state for @nr_sectors sectors starting at @cur_sector by
 * calling set_dirty_bitmap() with a dirty value of 0.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    const int dirty = 0;

    set_dirty_bitmap(bs, cur_sector, nr_sectors, dirty);
}
3825

    
3826
/* Return the current value of the dirty counter of @bs. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
3830

    
3831
/*
 * Set the in-use flag of @bs to @in_use.
 *
 * The new value must differ from the current one (asserted), so the flag
 * cannot be set or cleared twice in a row.
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);

    bs->in_use = in_use;
}
3836

    
3837
/* Return the in-use flag of @bs, as last stored by bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
3841

    
3842
/*
 * Enable I/O status tracking on @bs and start from
 * BLOCK_DEVICE_IO_STATUS_OK.
 */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    bs->iostatus_enabled = true;
}
3847

    
3848
/* The I/O status is only enabled if the drive explicitly
3849
 * enables it _and_ the VM is configured to stop on errors */
3850
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3851
{
3852
    return (bs->iostatus_enabled &&
3853
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3854
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3855
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
3856
}
3857

    
3858
/*
 * Disable I/O status tracking on @bs.  The stored iostatus value itself is
 * left untouched.
 */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
3862

    
3863
/*
 * Put the I/O status of @bs back to BLOCK_DEVICE_IO_STATUS_OK.  This is a
 * no-op when bdrv_iostatus_is_enabled() reports false.
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }

    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
3869

    
3870
/* XXX: Today this is set by device models because it makes the implementation
3871
   quite simple. However, the block layer knows about the error, so it's
3872
   possible to implement this without device models being involved */
3873
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3874
{
3875
    if (bdrv_iostatus_is_enabled(bs) &&
3876
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3877
        assert(error >= 0);
3878
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3879
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
3880
    }
3881
}
3882

    
3883
void
3884
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3885
        enum BlockAcctType type)
3886
{
3887
    assert(type < BDRV_MAX_IOTYPE);
3888

    
3889
    cookie->bytes = bytes;
3890
    cookie->start_time_ns = get_clock();
3891
    cookie->type = type;
3892
}
3893

    
3894
void
3895
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3896
{
3897
    assert(cookie->type < BDRV_MAX_IOTYPE);
3898

    
3899
    bs->nr_bytes[cookie->type] += cookie->bytes;
3900
    bs->nr_ops[cookie->type]++;
3901
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3902
}
3903

    
3904
int bdrv_img_create(const char *filename, const char *fmt,
3905
                    const char *base_filename, const char *base_fmt,
3906
                    char *options, uint64_t img_size, int flags)
3907
{
3908
    QEMUOptionParameter *param = NULL, *create_options = NULL;
3909
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
3910
    BlockDriverState *bs = NULL;
3911
    BlockDriver *drv, *proto_drv;
3912
    BlockDriver *backing_drv = NULL;
3913
    int ret = 0;
3914

    
3915
    /* Find driver and parse its options */
3916
    drv = bdrv_find_format(fmt);
3917
    if (!drv) {
3918
        error_report("Unknown file format '%s'", fmt);
3919
        ret = -EINVAL;
3920
        goto out;
3921
    }
3922

    
3923
    proto_drv = bdrv_find_protocol(filename);
3924
    if (!proto_drv) {
3925
        error_report("Unknown protocol '%s'", filename);
3926
        ret = -EINVAL;
3927
        goto out;
3928
    }
3929

    
3930
    create_options = append_option_parameters(create_options,
3931
                                              drv->create_options);
3932
    create_options = append_option_parameters(create_options,
3933
                                              proto_drv->create_options);
3934

    
3935
    /* Create parameter list with default values */
3936
    param = parse_option_parameters("", create_options, param);
3937

    
3938
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3939

    
3940
    /* Parse -o options */
3941
    if (options) {
3942
        param = parse_option_parameters(options, create_options, param);
3943
        if (param == NULL) {
3944
            error_report("Invalid options for file format '%s'.", fmt);
3945
            ret = -EINVAL;
3946
            goto out;
3947
        }
3948
    }
3949

    
3950
    if (base_filename) {
3951
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3952
                                 base_filename)) {
3953
            error_report("Backing file not supported for file format '%s'",
3954
                         fmt);
3955
            ret = -EINVAL;
3956
            goto out;
3957
        }
3958
    }
3959

    
3960
    if (base_fmt) {
3961
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3962
            error_report("Backing file format not supported for file "
3963
                         "format '%s'", fmt);
3964
            ret = -EINVAL;
3965
            goto out;
3966
        }
3967
    }
3968

    
3969
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3970
    if (backing_file && backing_file->value.s) {
3971
        if (!strcmp(filename, backing_file->value.s)) {
3972
            error_report("Error: Trying to create an image with the "
3973
                         "same filename as the backing file");
3974
            ret = -EINVAL;
3975
            goto out;
3976
        }
3977
    }
3978

    
3979
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3980
    if (backing_fmt && backing_fmt->value.s) {
3981
        backing_drv = bdrv_find_format(backing_fmt->value.s);
3982
        if (!backing_drv) {
3983
            error_report("Unknown backing file format '%s'",
3984
                         backing_fmt->value.s);
3985
            ret = -EINVAL;
3986
            goto out;
3987
        }
3988
    }
3989

    
3990
    // The size for the image must always be specified, with one exception:
3991
    // If we are using a backing file, we can obtain the size from there
3992
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
3993
    if (size && size->value.n == -1) {
3994
        if (backing_file && backing_file->value.s) {
3995
            uint64_t size;
3996
            char buf[32];
3997
            int back_flags;
3998

    
3999
            /* backing files always opened read-only */
4000
            back_flags =
4001
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4002

    
4003
            bs = bdrv_new("");
4004

    
4005
            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4006
            if (ret < 0) {
4007
                error_report("Could not open '%s'", backing_file->value.s);
4008
                goto out;
4009
            }
4010
            bdrv_get_geometry(bs, &size);
4011
            size *= 512;
4012

    
4013
            snprintf(buf, sizeof(buf), "%" PRId64, size);
4014
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4015
        } else {
4016
            error_report("Image creation needs a size parameter");
4017
            ret = -EINVAL;
4018
            goto out;
4019
        }
4020
    }
4021

    
4022
    printf("Formatting '%s', fmt=%s ", filename, fmt);
4023
    print_option_parameters(param);
4024
    puts("");
4025

    
4026
    ret = bdrv_create(drv, filename, param);
4027

    
4028
    if (ret < 0) {
4029
        if (ret == -ENOTSUP) {
4030
            error_report("Formatting or formatting option not supported for "
4031
                         "file format '%s'", fmt);
4032
        } else if (ret == -EFBIG) {
4033
            error_report("The image size is too large for file format '%s'",
4034
                         fmt);
4035
        } else {
4036
            error_report("%s: error while creating %s: %s", filename, fmt,
4037
                         strerror(-ret));
4038
        }
4039
    }
4040

    
4041
out:
4042
    free_option_parameters(create_options);
4043
    free_option_parameters(param);
4044

    
4045
    if (bs) {
4046
        bdrv_delete(bs);
4047
    }
4048

    
4049
    return ret;
4050
}
4051

    
4052
/*
 * Allocate a block job of type @job_type and attach it to @bs.
 *
 * The job object is zero-allocated with job_type->instance_size bytes, marked
 * busy, and @bs is flagged as in use.  A non-zero @speed is applied through
 * block_job_set_speed(); if that fails, everything is rolled back.
 *
 * Returns the new job, or NULL with @errp set when @bs already has a job or
 * is in use, or when setting the speed failed.
 */
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;
    Error *local_err = NULL;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type = job_type;
    job->bs = bs;
    job->cb = cb;
    job->opaque = opaque;
    job->busy = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed == 0) {
        return job;
    }

    block_job_set_speed(job, speed, &local_err);
    if (!error_is_set(&local_err)) {
        return job;
    }

    /* Setting the speed failed: detach and release the job again */
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
    error_propagate(errp, local_err);
    return NULL;
}
4087

    
4088
/*
 * Finish @job with result @ret.
 *
 * The job's completion callback is invoked first, while the job is still
 * attached; afterwards the job is detached from its BlockDriverState, freed,
 * and the in-use flag of the device is cleared.  @job must be the job
 * currently attached to its device (asserted) and must not be used after
 * this call.
 */
void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *owner = job->bs;

    assert(owner->job == job);

    job->cb(job->opaque, ret);

    owner->job = NULL;
    g_free(job);
    bdrv_set_in_use(owner, 0);
}
4098

    
4099
/*
 * Change the speed of @job to @speed.
 *
 * Sets QERR_NOT_SUPPORTED in @errp when the job type has no set_speed
 * callback.  Otherwise the callback is invoked; if it reports an error, the
 * error is propagated to @errp and job->speed is left unchanged.  job->speed
 * is updated only on success.
 */
void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    const BlockJobType *type = job->job_type;
    Error *local_err = NULL;

    if (!type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }

    type->set_speed(job, speed, &local_err);
    if (!error_is_set(&local_err)) {
        job->speed = speed;
        return;
    }

    error_propagate(errp, local_err);
}
4115

    
4116
/*
 * Request cancellation of @job.
 *
 * The cancelled flag is always set.  If the job has a coroutine and is not
 * marked busy, the coroutine is entered right away.
 */
void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;

    if (!job->co || job->busy) {
        return;
    }

    qemu_coroutine_enter(job->co, NULL);
}
4123

    
4124
/* Return the cancelled flag of @job, as set by block_job_cancel(). */
bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}
4128

    
4129
struct BlockCancelData {
4130
    BlockJob *job;
4131
    BlockDriverCompletionFunc *cb;
4132
    void *opaque;
4133
    bool cancelled;
4134
    int ret;
4135
};
4136

    
4137
static void block_job_cancel_cb(void *opaque, int ret)
4138
{
4139
    struct BlockCancelData *data = opaque;
4140

    
4141
    data->cancelled = block_job_is_cancelled(data->job);
4142
    data->ret = ret;
4143
    data->cb(data->opaque, ret);
4144
}
4145

    
4146
/*
 * Cancel @job and wait until its completion callback has run.
 *
 * @job must be the job currently attached to its device (asserted).
 *
 * Returns -ECANCELED when the job completed with result 0 but had its
 * cancelled flag set; otherwise returns the job's own result.
 */
int block_job_cancel_sync(BlockJob *job)
{
    struct BlockCancelData data;

    assert(job->bs->job == job);

    /* Set up our own callback to store the result and chain to
     * the original callback.
     */
    data.job = job;
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
    job->cb = block_job_cancel_cb;
    job->opaque = &data;

    block_job_cancel(job);

    /* block_job_cancel_cb() overwrites data.ret once the job completes */
    while (data.ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    if (data.cancelled && data.ret == 0) {
        return -ECANCELED;
    }
    return data.ret;
}
4168

    
4169
/*
 * Put @job to sleep for @ns nanoseconds on @clock, unless it has already
 * been cancelled.
 *
 * The busy flag is cleared for the duration of the sleep and set again
 * afterwards; block_job_cancel() only enters the job's coroutine while the
 * flag is clear.
 */
void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
{
    /* Check cancellation *before* setting busy = false, too!  */
    if (block_job_is_cancelled(job)) {
        return;
    }

    job->busy = false;
    co_sleep_ns(clock, ns);
    job->busy = true;
}
}