/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are serviced in FIFO order.  The next throttled request is
     * not dequeued until the current request has been allowed to proceed.
     * If the current request still exceeds the limits, it is re-inserted
     * at the head of the queue, so all requests behind it remain queued in
     * throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
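
/*
 * Illustrative sketch, not part of the original file: the coroutine I/O
 * paths gate each request on the throttle before it reaches the driver.
 * The function below is hypothetical and abridged; in this file the real
 * call sites are bdrv_co_do_readv() and bdrv_co_do_writev().
 */
static int coroutine_fn example_throttled_readv(BlockDriverState *bs,
                                                int64_t sector_num,
                                                int nb_sectors,
                                                QEMUIOVector *qiov)
{
    /* blocks here until the request fits within the configured limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }
    return bs->drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}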

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
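
/*
 * Worked example, not part of the original file, with hypothetical paths:
 * a relative backing file name is resolved against the directory of the
 * image that references it, while an absolute name passes through as-is.
 */
static void example_path_combine(void)
{
    char dest[64];

    path_combine(dest, sizeof(dest), "/vm/disk.qcow2", "base.qcow2");
    /* dest is now "/vm/base.qcow2" */

    path_combine(dest, sizeof(dest), "/vm/disk.qcow2", "/isos/base.qcow2");
    /* dest is now "/isos/base.qcow2" (absolute name copied unchanged) */
}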

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        /* free the duplicated filename on this early-exit path */
        g_free(cco.filename);
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}
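
/*
 * Illustrative sketch, not part of the original file: creating a 1 GiB
 * qcow2 image through this interface.  The file name is hypothetical.
 */
static int example_create_qcow2(void)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *options;
    int ret;

    if (!drv) {
        return -ENOENT;
    }
    options = parse_option_parameters("", drv->create_options, NULL);
    set_option_parameter_int(options, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
    ret = bdrv_create(drv, "/tmp/example.qcow2", options);
    free_option_parameters(options);
    return ret;
}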

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0 || close(fd)) {
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
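
/*
 * Worked examples, not part of the original file, of how the lookup above
 * resolves (assuming the usual drivers are registered):
 *
 *   "/vm/disk.qcow2"      -> "file" driver (no "<protocol>:" prefix)
 *   "nbd:localhost:10809" -> "nbd" driver (protocol prefix matches)
 *   "/dev/cdrom"          -> host CD-ROM driver, because find_hdev_driver()
 *                            is consulted before the protocol prefix
 */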

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
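
/*
 * Summary of the mapping above, not part of the original file:
 *
 *   mode           NOCACHE   CACHE_WB   NO_FLUSH
 *   off / none        x         x
 *   directsync        x
 *   writeback                   x
 *   unsafe                      x          x
 *   writethrough   (no flags set; this is the default)
 */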

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
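
/*
 * Illustrative sketch, not part of the original file: because the flag is
 * a reference count, each user must pair its enable with a disable.  A
 * hypothetical user looks like this.
 */
static void example_copy_on_read_user(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);
    /* ... issue I/O that relies on copy-on-read ... */
    bdrv_disable_copy_on_read(bs);
    /* copy-on-read stays active if another user still holds a reference */
}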

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
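
/*
 * Illustrative sketch, not part of the original file: opening an image
 * with BDRV_O_SNAPSHOT, as implemented above, produces a chain like
 *
 *   temporary qcow2 in $TMPDIR  --backing-->  original image
 *
 * so guest writes land in the throw-away overlay and the original file is
 * left untouched.  The file name below is hypothetical.
 */
static int example_open_snapshot(BlockDriverState *bs)
{
    return bdrv_open(bs, "/vm/disk.qcow2", BDRV_O_RDWR | BDRV_O_SNAPSHOT,
                     NULL);
}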

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices; for example, a coroutine
 * can be arbitrarily complex and keep issuing I/O until it completes.
 * Because of this, it is not possible to drain a single device's I/O queue
 * in isolation.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}
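
/*
 * Illustrative sketch, not part of the original file: the comment above
 * implies the usual quiesce-then-flush ordering at shutdown, so no request
 * can still be in flight when the flush runs.
 */
static void example_quiesce_and_flush(void)
{
    bdrv_drain_all();  /* wait for all in-flight requests to complete */
    bdrv_flush_all();  /* then push completed writes to stable storage */
}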

/* Make a BlockDriverState anonymous by removing it from the bdrv_states
   list.  Also, NUL-terminate device_name to prevent a double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_time         = bs_src->slice_time;
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->io_base            = bs_src->io_base;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_count        = bs_src->dirty_count;
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
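
/*
 * Illustrative sketch, not part of the original file: how a live snapshot
 * might use bdrv_append().  A new, anonymous overlay image is created and
 * opened, then spliced on top of the device's current image; 'bs' keeps
 * representing the device throughout.  The file name is hypothetical.
 */
static int example_live_snapshot(BlockDriverState *bs, BlockDriver *qcow2)
{
    BlockDriverState *overlay = bdrv_new("");   /* must stay anonymous */
    int ret;

    ret = bdrv_open(overlay, "/vm/snap.qcow2",
                    BDRV_O_RDWR | BDRV_O_NO_BACKING, qcow2);
    if (ret < 0) {
        bdrv_delete(overlay);
        return ret;
    }
    bdrv_append(overlay, bs);  /* overlay becomes the new top of the chain */
    return 0;
}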

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* pstrcpy guarantees NUL termination, unlike plain strncpy */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
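
/*
 * Worked example, not part of the original file: with a 64 KiB cluster
 * size, c = 65536 / BDRV_SECTOR_SIZE = 128 sectors per cluster.  A request
 * covering sectors [100, 110) is widened to the aligned range [0, 128):
 *
 *   *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)        = 0
 *   *cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 10, 128) = 128
 */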

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
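
/*
 * Worked example, not part of the original file, assuming
 * BDRV_SECTORS_PER_DIRTY_CHUNK == 2048 (1 MiB chunks of 512-byte sectors)
 * and 64-bit longs: marking sectors [4096, 4160) dirty gives
 *
 *   start = 4096 / 2048 = 2,   end = (4096 + 64 - 1) / 2048 = 2
 *   idx   = 2 / 64      = 0,   bit = 2 % 64 = 2
 *
 * so only bit 2 of dirty_bitmap[0] is set, and dirty_count is incremented
 * once (if the bit was previously clear).
 */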

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write to a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* read the remaining bytes from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
1736

    
1737
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it: in writethrough
     * mode every write is flushed by bdrv_co_do_writev(), so an explicit
     * flush is only required while the write cache is enabled. */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

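/* Copy-on-read support: the helper below serves a read and then writes the
 * data back into the topmost image, so later reads of the same cluster no
 * longer touch the backing file.  It always operates on whole clusters;
 * rounding the request out avoids a second backing-file round trip when the
 * image format allocates the cluster.
 */
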
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating the cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, so it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

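/* Synchronous wrappers around coroutine_fn code follow a common pattern in
 * this file: pack the arguments into a data struct, spawn a coroutine that
 * runs the coroutine_fn and sets a 'done' flag, then pump the event loop
 * with qemu_aio_wait() until the flag flips.  bdrv_is_allocated() below is
 * the canonical example.
 */
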
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
                                   &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            info->value->inserted->encryption_key_missing = bdrv_key_required(bs);
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            info->value->inserted->backing_file_depth =
                bdrv_get_backing_file_depth(bs);

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

/* Consider exposing this as a full-fledged QMP command */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    if (!bs->drv) {
        return NULL;
    }

    if (bs->backing_hd) {
        if (strcmp(bs->backing_file, backing_file) == 0) {
            return bs->backing_hd;
        } else {
            return bdrv_find_backing_image(bs->backing_hd, backing_file);
        }
    }

    return NULL;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

#define NB_SUFFIXES 4

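/* Formats a size with a power-of-1024 suffix, keeping at most three digits
 * before the suffix.  A few examples of the rounding behaviour:
 * 999 -> "999", 1536 -> "1.5K", 10485760 -> "10M"; values beyond the last
 * suffix are still printed with "T".
 */
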
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}

char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}

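/* bdrv_aio_multiwrite() machinery: one MultiwriteCB fans a batch of write
 * requests back in.  num_requests counts the submitted AIOs; when the last
 * one completes, every caller-supplied callback in callbacks[] is invoked
 * with the first error seen (or 0), and any qiov created by merging is
 * destroyed and freed.
 */
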
typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

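/* Worked example for multiwrite_merge() below: two sorted requests covering
 * sectors [0, 8) and [8, 16) are exactly sequential, so the second qiov is
 * concatenated onto the first and the pair collapses into a single request
 * for sectors [0, 16).  Overlapping requests merge the same way, except the
 * overlapped tail of the earlier request is dropped from the merged qiov.
 */
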
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case, this function returns -1 and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}

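/* Slice-based throttling: time is divided into slices and, within the
 * current slice, the helpers below compare the bytes/ops already accounted
 * against limit * slice_time.  When the budget is exceeded they compute how
 * long the request has to wait.  Worked example for the bps check: with
 * bps_limit = 1 MB/s and a 0.1 s slice, bytes_limit is about 100 KB; if
 * 80 KB were already written and a 64 KB request arrives, 144 KB exceeds
 * the budget and the request waits roughly 144 KB / (1 MB/s) minus the
 * time already elapsed in the slice.
 */
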
/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written, obtained
     *             from the history statistics.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: the total time needed to
     *             complete reading/writing all the data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limit, bs->slice_end needs
     * to be extended so that the current statistics are kept until the
     * timer fires; the factor used here was tuned experimentally.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}

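/* Fallback AIO emulation for drivers that only provide synchronous
 * bdrv_read/bdrv_write: the request is executed synchronously into a bounce
 * buffer and completion is signalled from a bottom half, so callers still
 * observe the usual asynchronous completion ordering.
 */
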
/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_flush(bs, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

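/* Allocate an ACB, reusing one from the pool's free list when possible;
 * otherwise allocate a zeroed ACB of the pool's size and bind it to the
 * pool so qemu_aio_release() can return it to the free list.
 */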
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}

/**************************************************************/
/* Coroutine block device emulation */

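/* Pairs a yielded coroutine with the return value delivered by an AIO
 * completion callback; see bdrv_co_io_em_complete().
 */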
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

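/* Emulate coroutine-style I/O on top of a driver that only provides an
 * AIO interface: issue the request, then yield until
 * bdrv_co_io_em_complete() re-enters us with the result.
 */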
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

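/* Flush in two stages: first write cached data back to the OS (even with
 * cache=unsafe), then force it down to the disk unless BDRV_O_NO_FLUSH
 * is set, and finally recurse into the underlying protocol (bs->file).
 */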
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

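/* Synchronous wrapper around bdrv_co_flush(): run the coroutine entry
 * directly when already in coroutine context, otherwise spawn a
 * coroutine and poll with qemu_aio_wait() until it completes.
 */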
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

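/* Discard falls back through the available driver interfaces: native
 * bdrv_co_discard, then bdrv_aio_discard via yield/re-enter, and is a
 * silent no-op for drivers that implement neither.
 */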
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media.  Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

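/* Dirty tracking uses one bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors;
 * the bitmap is sized in longs, rounding the device length up to a whole
 * multiple of BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG sectors.
 */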
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

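/* I/O accounting: the cookie records the size, type, and start time of a
 * request; bdrv_acct_done() folds them into the per-type totals.
 */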
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

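/* Create an image file: look up the format and protocol drivers, merge
 * their creation options with the -o options and backing-file settings,
 * infer the size from the backing file if none was given, and finally
 * call bdrv_create().
 */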
int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}

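/* Start a long-running job on bs.  Fails and sets errp if the device
 * already has a job or is otherwise in use; on success the device is
 * marked in-use until block_job_complete().
 */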
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type      = job_type;
    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;

        block_job_set_speed(job, speed, &local_err);
        if (error_is_set(&local_err)) {
            bs->job = NULL;
            g_free(job);
            bdrv_set_in_use(bs, 0);
            error_propagate(errp, local_err);
            return NULL;
        }
    }
    return job;
}

void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}

void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }
    job->job_type->set_speed(job, speed, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    job->speed = speed;
}

void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
    if (job->co && !job->busy) {
        qemu_coroutine_enter(job->co, NULL);
    }
}

bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}

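/* block_job_cancel_sync() interposes block_job_cancel_cb() as the job's
 * completion callback so it can capture the result while still chaining
 * to the original callback.
 */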
struct BlockCancelData {
    BlockJob *job;
    BlockDriverCompletionFunc *cb;
    void *opaque;
    bool cancelled;
    int ret;
};

static void block_job_cancel_cb(void *opaque, int ret)
{
    struct BlockCancelData *data = opaque;

    data->cancelled = block_job_is_cancelled(data->job);
    data->ret = ret;
    data->cb(data->opaque, ret);
}

int block_job_cancel_sync(BlockJob *job)
{
    struct BlockCancelData data;
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);

    /* Set up our own callback to store the result and chain to
     * the original callback.
     */
    data.job = job;
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
    job->cb = block_job_cancel_cb;
    job->opaque = &data;
    block_job_cancel(job);
    while (data.ret == -EINPROGRESS) {
        qemu_aio_wait();
    }
    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
}

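/* Yield the coroutine for ns nanoseconds.  Clearing job->busy first lets
 * block_job_cancel() wake the coroutine early instead of waiting for the
 * sleep to finish.
 */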
void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
{
    /* Check cancellation *before* setting busy = false, too!  */
    if (!block_job_is_cancelled(job)) {
        job->busy = false;
        co_sleep_ns(clock, ns);
        job->busy = true;
    }
}