/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

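/* vm_clock timer callback: wake up the next request waiting in throttled_reqs */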
static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are kept in FIFO order: the next throttled request is not
     * dequeued until the current request is allowed to be serviced. So if
     * the current request still exceeds the limits, it is re-inserted at
     * the head of the queue, and all requests that follow it remain in
     * the throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

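/* Parameters handed to the image creation coroutine */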
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0 || close(fd)) {
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

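/* Probe the image content with each driver's bdrv_probe() and return the
 * highest-scoring format in *pdrv */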
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example, a coroutine
 * can be arbitrarily complex, and a constant flow of I/O can arrive until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states
   list. Also, NUL-terminate the device_name to prevent a double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

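/* Give the driver a chance to update internal state after the contents of
 * bs have been swapped (called from bdrv_swap) */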
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_time         = bs_src->slice_time;
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->io_base            = bs_src->io_base;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* geometry */
    bs_dest->cyls               = bs_src->cyls;
    bs_dest->heads              = bs_src->heads;
    bs_dest->secs               = bs_src->secs;
    bs_dest->translation        = bs_src->translation;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_count        = bs_src->dirty_count;
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

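/* Emit a QMP DEVICE_TRAY_MOVED event reflecting the new tray state */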
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* use pstrcpy so the copied filename is guaranteed NUL-terminated */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

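/* Check whether the range [sector_num, sector_num + nb_sectors) intersects req */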
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

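/* Validate that a byte-range request lies within the device (growable
 * devices skip the length check) */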
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

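/* Sector-based wrapper around bdrv_check_byte_request() */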
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

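/* Coroutine entry point used to emulate synchronous reads and writes */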
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In the synchronous call context the vcpu is blocked, so the throttling
     * timer will never fire; I/O throttling therefore has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

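/* Mark or clear the dirty bitmap bits covering the given sector range */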
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write to a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

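/* Byte-granularity read built on the sector-based bdrv_read(); an unaligned
 * head or tail goes through a one-sector bounce buffer */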
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

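/* Example (editor's sketch, not part of the original source): byte-granular
 * access with bdrv_pread()/bdrv_pwrite().  Reading 8 bytes at offset 510
 * straddles a sector boundary, so bdrv_pread() transparently performs the
 * unaligned head/tail handling shown above (2 bytes from the first sector,
 * 6 from the next).  On success both helpers return the requested byte count.
 *
 *     uint8_t magic[8];
 *     int ret = bdrv_pread(bs, 510, magic, sizeof(magic));
 *     if (ret < 0) {
 *         return ret;
 *     }
 */
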
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

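/* Example (editor's sketch, not part of the original source): updating image
 * metadata that must reach the disk before any later write, e.g. a
 * qcow2-style header field.  "header_offset" and "new_features" are
 * hypothetical names; the flush issued for writeback cache modes is what
 * provides the ordering barrier.
 *
 *     uint32_t features = cpu_to_be32(new_features);   // hypothetical field
 *     ret = bdrv_pwrite_sync(bs, header_offset, &features,
 *                            sizeof(features));
 *     if (ret < 0) {
 *         return ret;       // the write or the flush failed
 *     }
 */
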
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating a cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

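/* Worked example (editor's note, not in the original source): assuming
 * 64-sector clusters, a copy-on-read request for sectors [100, 103) rounds
 * out to the cluster range [64, 128): the whole cluster is read into the
 * bounce buffer and written back, and skip_bytes = (100 - 64) * 512 selects
 * the caller's slice of it.  This keeps the image from needing a second
 * backing-file read when the cluster is later allocated.
 */
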
/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

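/* Example (editor's sketch, not part of the original source): walking an
 * image's allocation map with the synchronous wrapper.  Each iteration
 * advances by 'pnum', the length of the current run of same-state sectors;
 * requests past the end of the image are clamped as documented above.
 *
 *     int64_t sector = 0;
 *     uint64_t total;
 *     bdrv_get_geometry(bs, &total);
 *     while (sector < total) {
 *         int pnum;
 *         int ret = bdrv_is_allocated(bs, sector, 65536, &pnum);
 *         if (ret < 0) {
 *             break;             // driver reported an error
 *         }
 *         // ret != 0: [sector, sector + pnum) is allocated in this image
 *         sector += pnum;
 *     }
 */
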
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

/* Consider exposing this as a full-fledged QMP command */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    if (!bs->drv) {
        return NULL;
    }

    if (bs->backing_hd) {
        if (strcmp(bs->backing_file, backing_file) == 0) {
            return bs->backing_hd;
        } else {
            return bdrv_find_backing_image(bs->backing_hd, backing_file);
        }
    }

    return NULL;
}

#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}

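/* Worked examples (editor's note, not in the original source):
 * 999 -> "999" (three digits or fewer, no suffix); 1536 -> "1.5K" (below
 * 10 * 1024, one decimal is kept); 524288 -> "512K" (rounded to the nearest
 * whole unit via the (base >> 1) addend); 2147483648 -> "2.0G".
 */
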
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}

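/* Example (editor's sketch, not part of the original source): issuing an
 * asynchronous read and consuming the result from the completion callback.
 * "my_done" and "my_state" are hypothetical names; the callback receives 0
 * or -errno once the request finishes.
 *
 *     static void my_done(void *opaque, int ret)
 *     {
 *         struct my_state *s = opaque;
 *         s->ret = ret;                  // 0 on success, -errno on failure
 *     }
 *
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buffer, BDRV_SECTOR_SIZE);
 *     acb = bdrv_aio_readv(bs, 0, &qiov, 1, my_done, s);
 */
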
typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

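/* Worked example (editor's note, not in the original source): two sorted
 * requests, A = [sector 0, 8 sectors) and B = [sector 8, 4 sectors), are
 * exactly sequential (B.sector == oldreq_last), so they merge into a single
 * request covering sectors [0, 12).  If B instead started at sector 4, the
 * last 4 sectors of A's qiov would be dropped before B is appended, since
 * the overlapping region takes B's data.
 */
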
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}

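/* Example (editor's sketch, not part of the original source): batching two
 * writes through bdrv_aio_multiwrite().  Each request keeps its own
 * completion callback; merging inside the implementation does not change
 * which callbacks fire.  "done_cb", "req_a" and "req_b" are hypothetical
 * names.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov_a,
 *           .cb = done_cb, .opaque = req_a },
 *         { .sector = 8, .nb_sectors = 4, .qiov = &qiov_b,
 *           .cb = done_cb, .opaque = req_b },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // inspect reqs[i].error: requests with error != 0 get no callback
 *     }
 */
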
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; and
     *             it is obtained from the history statistic info.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calculate
     *             the total time for completing reading/writing all data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits,
     * bs->slice_end needs to be extended in order that the current statistic
     * info can be kept until the timer fires, so it is increased and tuned
     * based on the result of experiment.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

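/* Worked example (editor's note, not in the original source): with
 * bps_limit = 1 MB/s and a 0.1 s slice, bytes_limit is roughly 100 KB.
 * If 90 KB have already been accounted in this slice (bytes_base) and a
 * 64 KB request arrives (bytes_res), the request exceeds the budget;
 * 154 KB at 1 MB/s takes about 0.154 s, so after elapsed_time = 0.1 s the
 * request must wait roughly 0.054 s.  Assuming BLOCK_IO_SLICE_TIME is
 * 100 ms expressed in nanoseconds, BLOCK_IO_SLICE_TIME * 10 converts that
 * wait from seconds back into vm_clock nanoseconds.
 */
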
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3382
{
3383
    BlockDriverAIOCBCoroutine *acb = opaque;
3384
    BlockDriverState *bs = acb->common.bs;
3385

    
3386
    acb->req.error = bdrv_co_flush(bs);
3387
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3388
    qemu_bh_schedule(acb->bh);
3389
}
3390

    
3391
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3392
        BlockDriverCompletionFunc *cb, void *opaque)
3393
{
3394
    trace_bdrv_aio_flush(bs, opaque);
3395

    
3396
    Coroutine *co;
3397
    BlockDriverAIOCBCoroutine *acb;
3398

    
3399
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3400
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3401
    qemu_coroutine_enter(co, acb);
3402

    
3403
    return &acb->common;
3404
}
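
/*
 * Usage sketch (illustrative only; my_flush_cb is a hypothetical caller
 * callback, not part of this file):
 *
 *     static void my_flush_cb(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             error_report("flush failed: %s", strerror(-ret));
 *         }
 *     }
 *
 *     bdrv_aio_flush(bs, my_flush_cb, NULL);
 *
 * The callback runs from a bottom half once bdrv_co_flush() has finished.
 */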

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}
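
/*
 * Added note: AIOCBs are recycled through a per-pool, singly linked free
 * list, so g_malloc0() only runs while the cache is cold.  The intended
 * pairing, with a hypothetical MyAIOCB type whose first member is a
 * BlockDriverAIOCB, looks like
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_pool, bs, cb, opaque);
 *     ...
 *     qemu_aio_release(acb);    <- back onto my_pool.free_aiocb
 */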

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
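
/*
 * Added note: this is the inverse of the AIO emulation above.  The
 * coroutine parks itself with qemu_coroutine_yield() after submitting the
 * driver's AIO request; bdrv_co_io_em_complete() later stores the result
 * and re-enters the coroutine, so from coroutine context the call looks
 * synchronous:
 *
 *     ret = bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
 */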

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
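
/*
 * Flush ordering recap (added note): for a format driver stacked on a
 * protocol driver, e.g. qcow2 on a raw file, one bdrv_co_flush(bs) is
 * roughly
 *
 *     bdrv_co_flush_to_os(bs)      <- data out to the host page cache
 *     bdrv_co_flush_to_disk(bs)    <- skipped when BDRV_O_NO_FLUSH is set
 *     bdrv_co_flush(bs->file)      <- recurse into the protocol layer
 *
 * so cache=unsafe still writes data back to the OS but never forces it
 * out to the physical disk.
 */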

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
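
/*
 * Added note: bdrv_flush() is the standard synchronous-over-coroutine
 * wrapper used throughout this file.  RwCo.ret starts out as NOT_DONE;
 * in coroutine context the entry function runs inline, otherwise a new
 * coroutine is spawned and the caller pumps the main loop with
 * qemu_aio_wait() until a real return value has been stored.
 * bdrv_discard() below follows exactly the same shape.
 */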

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}
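
/*
 * Added usage note: I/O buffers handed to the block layer should come
 * from qemu_blockalign() so that images opened with cache=none (O_DIRECT)
 * get whatever alignment the host device demands, e.g.
 *
 *     uint8_t *buf = qemu_blockalign(bs, 4096);
 *     bdrv_read(bs, 0, buf, 8);
 *     qemu_vfree(buf);
 *
 * with a fallback to 512-byte alignment when none has been negotiated.
 */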

void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
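
/*
 * Worked example (added; assumes BDRV_SECTORS_PER_DIRTY_CHUNK is 2048,
 * i.e. 1 MiB chunks, and 64-bit longs): one unsigned long then covers
 * 2048 * 64 sectors = 64 MiB of the image, so a 10 GiB image needs
 *
 *     bitmap_size = ceil(10 GiB / 64 MiB) = 160 longs = 1280 bytes.
 *
 * The "+ BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1" term is the
 * usual round-up before the integer division.
 */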

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
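
/*
 * Typical use in a device model (added sketch; req->acct stands for a
 * BlockAcctCookie embedded in a hypothetical request structure):
 *
 *     bdrv_acct_start(bs, &req->acct, len, BDRV_ACCT_READ);
 *     ... submit the I/O ...
 *     bdrv_acct_done(bs, &req->acct);    <- from the completion path
 *
 * These counters back the per-type nr_bytes/nr_ops/total_time_ns values
 * reported by query-blockstats.
 */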

int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}
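
/*
 * Added note: this is the entry point behind "qemu-img create".  For
 * example,
 *
 *     qemu-img create -f qcow2 -o backing_file=base.img overlay.qcow2
 *
 * reaches here with fmt="qcow2" and the size left at -1, so the image
 * size is read back from the (read-only) backing file as shown above.
 */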

void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type      = job_type;
    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;

        block_job_set_speed(job, speed, &local_err);
        if (error_is_set(&local_err)) {
            bs->job = NULL;
            g_free(job);
            bdrv_set_in_use(bs, 0);
            error_propagate(errp, local_err);
            return NULL;
        }
    }
    return job;
}
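
/*
 * Life cycle sketch (added; my_job_type and the surrounding names are
 * hypothetical, standing in for a concrete job such as image streaming):
 *
 *     job = block_job_create(&my_job_type, bs, speed, cb, opaque, errp);
 *     if (!job) { ... }                    <- device was already in use
 *     ... the job coroutine runs, calling block_job_sleep_ns() to
 *         throttle and block_job_is_cancelled() to poll for cancel ...
 *     block_job_complete(job, ret);        <- drops bs->job and in_use
 */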

void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}

void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }
    job->job_type->set_speed(job, speed, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    job->speed = speed;
}

void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
    if (job->co && !job->busy) {
        qemu_coroutine_enter(job->co, NULL);
    }
}

bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}

struct BlockCancelData {
    BlockJob *job;
    BlockDriverCompletionFunc *cb;
    void *opaque;
    bool cancelled;
    int ret;
};

static void block_job_cancel_cb(void *opaque, int ret)
{
    struct BlockCancelData *data = opaque;

    data->cancelled = block_job_is_cancelled(data->job);
    data->ret = ret;
    data->cb(data->opaque, ret);
}

int block_job_cancel_sync(BlockJob *job)
{
    struct BlockCancelData data;
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);

    /* Set up our own callback to store the result and chain to
     * the original callback.
     */
    data.job = job;
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
    job->cb = block_job_cancel_cb;
    job->opaque = &data;
    block_job_cancel(job);
    while (data.ret == -EINPROGRESS) {
        qemu_aio_wait();
    }
    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
}

void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
{
    /* Check cancellation *before* setting busy = false, too!  */
    if (!block_job_is_cancelled(job)) {
        job->busy = false;
        co_sleep_ns(clock, ns);
        job->busy = true;
    }
}