Statistics
| Branch: | Revision:

root / block.c @ e1e9b0ac

History | View | Annotate | Download (120.9 kB)

1
/*
2
 * QEMU System Emulator block driver
3
 *
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "config-host.h"
25
#include "qemu-common.h"
26
#include "trace.h"
27
#include "monitor.h"
28
#include "block_int.h"
29
#include "module.h"
30
#include "qjson.h"
31
#include "qemu-coroutine.h"
32
#include "qmp-commands.h"
33
#include "qemu-timer.h"
34

    
35
#ifdef CONFIG_BSD
36
#include <sys/types.h>
37
#include <sys/stat.h>
38
#include <sys/ioctl.h>
39
#include <sys/queue.h>
40
#ifndef __DragonFly__
41
#include <sys/disk.h>
42
#endif
43
#endif
44

    
45
#ifdef _WIN32
46
#include <windows.h>
47
#endif
48

    
49
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50

    
51
/* Per-request flags passed to bdrv_co_do_readv()/bdrv_co_do_writev().
 * Each flag occupies its own bit. */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
55

    
56
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59
        BlockDriverCompletionFunc *cb, void *opaque);
60
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62
        BlockDriverCompletionFunc *cb, void *opaque);
63
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64
                                         int64_t sector_num, int nb_sectors,
65
                                         QEMUIOVector *iov);
66
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67
                                         int64_t sector_num, int nb_sectors,
68
                                         QEMUIOVector *iov);
69
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71
    BdrvRequestFlags flags);
72
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74
    BdrvRequestFlags flags);
75
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76
                                               int64_t sector_num,
77
                                               QEMUIOVector *qiov,
78
                                               int nb_sectors,
79
                                               BlockDriverCompletionFunc *cb,
80
                                               void *opaque,
81
                                               bool is_write);
82
static void coroutine_fn bdrv_co_do_rw(void *opaque);
83
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84
    int64_t sector_num, int nb_sectors);
85

    
86
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87
        bool is_write, double elapsed_time, uint64_t *wait);
88
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89
        double elapsed_time, uint64_t *wait);
90
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91
        bool is_write, int64_t *wait);
92

    
93
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
95

    
96
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
98

    
99
/* The device to use for VM snapshots */
100
static BlockDriverState *bs_snapshots;
101

    
102
/* If non-zero, use only whitelisted block drivers */
103
static int use_bdrv_whitelist;
104

    
105
#ifdef _WIN32
106
/* Return 1 if filename begins with "<ASCII letter>:", otherwise 0. */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_alpha = (letter >= 'a' && letter <= 'z') ||
                         (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only read once filename[0] is known to be a letter,
     * i.e. not the terminating NUL */
    return is_alpha && filename[1] == ':';
}
112

    
113
/* Return 1 if filename is exactly a drive spec ("d:") or starts with the
 * device prefix "\\.\" or "//./"; otherwise return 0. */
int is_windows_drive(const char *filename)
{
    /* bare drive spec: prefix immediately followed by end of string */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }

    return 0;
}
123
#endif
124

    
125
/* throttling disk I/O limits */
126
/* Turn I/O throttling off for bs: release queued requests, destroy the
 * throttle timer and reset the slice accounting state. */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    /* keep calling qemu_co_queue_next() until it reports nothing was left */
    while (qemu_co_queue_next(&bs->throttled_reqs)) {
        /* nothing to do in the body */
    }

    if (bs->block_timer != NULL) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_time  = 0;
    bs->slice_start = 0;
    bs->slice_end   = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}
143

    
144
static void bdrv_block_timer(void *opaque)
145
{
146
    BlockDriverState *bs = opaque;
147

    
148
    qemu_co_queue_next(&bs->throttled_reqs);
149
}
150

    
151
/* Turn I/O throttling on for bs: set up the request queue and timer, then
 * start a fresh accounting slice beginning at the current vm_clock time. */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);

    /* first slice: [now, now + 5 * BLOCK_IO_SLICE_TIME) */
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_time + bs->slice_start;

    memset(&bs->io_base, 0, sizeof(bs->io_base));

    bs->io_limits_enabled = true;
}
161

    
162
/* Return true if any of the six configured limits (bps or iops, for read,
 * write or total) is non-zero. */
bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *lim = &bs->io_limits;

    if (lim->bps[BLOCK_IO_LIMIT_READ] ||
        lim->bps[BLOCK_IO_LIMIT_WRITE] ||
        lim->bps[BLOCK_IO_LIMIT_TOTAL]) {
        return true;
    }
    if (lim->iops[BLOCK_IO_LIMIT_READ] ||
        lim->iops[BLOCK_IO_LIMIT_WRITE] ||
        lim->iops[BLOCK_IO_LIMIT_TOTAL]) {
        return true;
    }
    return false;
}
172

    
173
/* Throttle the current request: wait behind already-queued requests, then
 * keep waiting (re-arming block_timer each time) while the request still
 * exceeds the I/O limits. */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* other requests are already throttled: queue up behind them */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted at the head. All requests following it
     * will still be in the throttled_reqs queue.
     */
    for (;;) {
        if (!bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
            break;
        }
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* this request may proceed; let the next queued one take its turn */
    qemu_co_queue_next(&bs->throttled_reqs);
}
197

    
198
/* Check whether path starts with "<protocol>:", i.e. whether the first
 * ':' or path separator found in it is a ':'.  On Windows, drive
 * specifications are never treated as protocols. */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    const char *stop_chars = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    const char *stop_chars = ":/";
#endif

    /* strcspn() yields the index of the first stop character (or of the
     * terminating NUL when there is none) */
    return path[strcspn(path, stop_chars)] == ':';
}
215

    
216
/* Return non-zero if path is absolute: it starts with '/', or on Windows
 * also with '\\' or a drive/device specification (e.g. "\\.\d:"). */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
228

    
229
/* If filename is absolute, just copy it to dest.  Otherwise build a path to
 * it by considering it relative to base_path: everything in base_path up to
 * and including its last path separator - or its first ':' if that comes
 * later - is kept as prefix, and filename is appended.  URLs are supported.
 * The result is truncated to fit dest_size; nothing is written when
 * dest_size <= 0.
 *
 * NOTE(review): any ':' in base_path is taken as a protocol separator, even
 * one that follows a '/' (e.g. "dir/a:b") - confirm this is intended.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' (or start of base_path if none) */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last path separator (or start if none) */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* the prefix ends at whichever of the two positions comes later */
    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;

    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
272

    
273
/* Write the backing file name of bs into dest (at most sz bytes).  An empty
 * name or one carrying a protocol prefix is copied unchanged; anything else
 * is combined with bs->filename via path_combine(). */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    const char *backing = bs->backing_file;

    if (backing[0] != '\0' && !path_has_protocol(backing)) {
        path_combine(dest, sz, bs->filename, backing);
        return;
    }
    pstrcpy(dest, sz, backing);
}
281

    
282
/* Register a block driver, filling in emulated read/write entry points for
 * the interfaces the driver does not implement itself. */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    const bool emulate_co = !bdrv->bdrv_co_readv;
    /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
     * the block driver lacks aio we need to emulate that too.
     */
    const bool emulate_aio = emulate_co && !bdrv->bdrv_aio_readv;

    if (emulate_co) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;
    }
    if (emulate_aio) {
        /* add AIO emulation layer */
        bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
        bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
301

    
302
/* create a new block device (by default it is empty) */
303
BlockDriverState *bdrv_new(const char *device_name)
304
{
305
    BlockDriverState *bs;
306

    
307
    bs = g_malloc0(sizeof(BlockDriverState));
308
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
309
    if (device_name[0] != '\0') {
310
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
311
    }
312
    bdrv_iostatus_disable(bs);
313
    return bs;
314
}
315

    
316
BlockDriver *bdrv_find_format(const char *format_name)
317
{
318
    BlockDriver *drv1;
319
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
320
        if (!strcmp(drv1->format_name, format_name)) {
321
            return drv1;
322
        }
323
    }
324
    return NULL;
325
}
326

    
327
/* Return 1 if drv's format name is in the build-time whitelist, or if the
 * whitelist is empty; 0 otherwise.  The scan stops at the first NULL entry,
 * so CONFIG_BDRV_WHITELIST is presumably a NULL-terminated initializer list
 * (defined outside this file). */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    size_t i;

    if (whitelist[0] == NULL) {
        return 1;               /* no whitelist, anything goes */
    }

    for (i = 0; whitelist[i] != NULL; i++) {
        if (strcmp(drv->format_name, whitelist[i]) == 0) {
            return 1;
        }
    }

    return 0;
}
344

    
345
/* Like bdrv_find_format(), but returns NULL as well when the driver found is
 * not accepted by bdrv_is_whitelisted(). */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);

    if (drv == NULL) {
        return NULL;
    }
    if (!bdrv_is_whitelisted(drv)) {
        return NULL;
    }
    return drv;
}
350

    
351
typedef struct CreateCo {
352
    BlockDriver *drv;
353
    char *filename;
354
    QEMUOptionParameter *options;
355
    int ret;
356
} CreateCo;
357

    
358
/* Coroutine entry point for bdrv_create(): invoke the driver's create
 * callback and store its return value in the CreateCo passed as opaque. */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *request = opaque;
    BlockDriver *drv = request->drv;

    assert(drv);

    request->ret = drv->bdrv_create(request->filename, request->options);
}
365

    
366
int bdrv_create(BlockDriver *drv, const char* filename,
367
    QEMUOptionParameter *options)
368
{
369
    int ret;
370

    
371
    Coroutine *co;
372
    CreateCo cco = {
373
        .drv = drv,
374
        .filename = g_strdup(filename),
375
        .options = options,
376
        .ret = NOT_DONE,
377
    };
378

    
379
    if (!drv->bdrv_create) {
380
        return -ENOTSUP;
381
    }
382

    
383
    if (qemu_in_coroutine()) {
384
        /* Fast-path if already in coroutine context */
385
        bdrv_create_co_entry(&cco);
386
    } else {
387
        co = qemu_coroutine_create(bdrv_create_co_entry);
388
        qemu_coroutine_enter(co, &cco);
389
        while (cco.ret == NOT_DONE) {
390
            qemu_aio_wait();
391
        }
392
    }
393

    
394
    ret = cco.ret;
395
    g_free(cco.filename);
396

    
397
    return ret;
398
}
399

    
400
/* Create filename using the protocol driver chosen by bdrv_find_protocol().
 * Returns -ENOENT if no protocol driver matches, otherwise the result of
 * bdrv_create(). */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *proto_drv = bdrv_find_protocol(filename);

    if (!proto_drv) {
        return -ENOENT;
    }
    return bdrv_create(proto_drv, filename, options);
}
411

    
412
/*
 * Create a uniquely-named empty temporary file.
 *
 * filename: output buffer that receives the name of the created file
 * size:     size of the buffer in bytes (must be >= MAX_PATH on Windows)
 *
 * Return 0 upon success, otherwise a negative errno value: -EOVERFLOW if
 * the name does not fit into the buffer (or size is not positive), or the
 * negated errno of the failing mkstemp()/close() call.  No file is left
 * behind on failure.
 * NOTE(review): the Windows branch returns -GetLastError(), which is a
 * Windows error code rather than an errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int len;
    int saved_errno;
    const char *tmpdir;

    /* a non-positive size would be converted to a huge size_t by snprintf */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }

    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* do not leave the file behind; unlink() may overwrite errno */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
442

    
443
/*
444
 * Detect host devices. By convention, /dev/cdrom[N] is always
445
 * recognized as a host CDROM.
446
 */
447
static BlockDriver *find_hdev_driver(const char *filename)
448
{
449
    int score_max = 0, score;
450
    BlockDriver *drv = NULL, *d;
451

    
452
    QLIST_FOREACH(d, &bdrv_drivers, list) {
453
        if (d->bdrv_probe_device) {
454
            score = d->bdrv_probe_device(filename);
455
            if (score > score_max) {
456
                score_max = score;
457
                drv = d;
458
            }
459
        }
460
    }
461

    
462
    return drv;
463
}
464

    
465
BlockDriver *bdrv_find_protocol(const char *filename)
466
{
467
    BlockDriver *drv1;
468
    char protocol[128];
469
    int len;
470
    const char *p;
471

    
472
    /* TODO Drivers without bdrv_file_open must be specified explicitly */
473

    
474
    /*
475
     * XXX(hch): we really should not let host device detection
476
     * override an explicit protocol specification, but moving this
477
     * later breaks access to device names with colons in them.
478
     * Thanks to the brain-dead persistent naming schemes on udev-
479
     * based Linux systems those actually are quite common.
480
     */
481
    drv1 = find_hdev_driver(filename);
482
    if (drv1) {
483
        return drv1;
484
    }
485

    
486
    if (!path_has_protocol(filename)) {
487
        return bdrv_find_format("file");
488
    }
489
    p = strchr(filename, ':');
490
    assert(p != NULL);
491
    len = p - filename;
492
    if (len > sizeof(protocol) - 1)
493
        len = sizeof(protocol) - 1;
494
    memcpy(protocol, filename, len);
495
    protocol[len] = '\0';
496
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
497
        if (drv1->protocol_name &&
498
            !strcmp(drv1->protocol_name, protocol)) {
499
            return drv1;
500
        }
501
    }
502
    return NULL;
503
}
504

    
505
/*
 * Probe the image format of filename.
 *
 * The file is opened through its protocol driver and up to 2048 bytes from
 * its start are offered to every driver with a bdrv_probe callback; the
 * highest-scoring driver is stored in *pdrv.  scsi-generic devices and
 * empty drives get the "raw" driver without probing.
 *
 * Returns a negative errno value with *pdrv == NULL on failure (-ENOENT
 * when no driver matches).  On success the return value is non-negative
 * (the open result or the number of bytes probed).
 */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    BlockDriverState *bs;
    BlockDriver *candidate;
    BlockDriver *best = NULL;
    uint8_t buf[2048];
    int best_score = 0;
    int ret;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        best = bdrv_find_format("raw");
        *pdrv = best;
        return best ? ret : -ENOENT;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* ret now holds the number of valid bytes in buf */
    QLIST_FOREACH(candidate, &bdrv_drivers, list) {
        int score;

        if (!candidate->bdrv_probe) {
            continue;
        }
        score = candidate->bdrv_probe(buf, ret, filename);
        if (score > best_score) {
            best_score = score;
            best = candidate;
        }
    }

    *pdrv = best;
    return best ? ret : -ENOENT;
}
553

    
554
/**
555
 * Set the current 'total_sectors' value
556
 */
557
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
558
{
559
    BlockDriver *drv = bs->drv;
560

    
561
    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
562
    if (bs->sg)
563
        return 0;
564

    
565
    /* query actual device if possible, otherwise just trust the hint */
566
    if (drv->bdrv_getlength) {
567
        int64_t length = drv->bdrv_getlength(bs);
568
        if (length < 0) {
569
            return length;
570
        }
571
        hint = length >> BDRV_SECTOR_BITS;
572
    }
573

    
574
    bs->total_sectors = hint;
575
    return 0;
576
}
577

    
578
/**
579
 * Set open flags for a given cache mode
580
 *
581
 * Return 0 on success, -1 if the cache mode was invalid.
582
 */
583
int bdrv_parse_cache_flags(const char *mode, int *flags)
584
{
585
    *flags &= ~BDRV_O_CACHE_MASK;
586

    
587
    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
588
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
589
    } else if (!strcmp(mode, "directsync")) {
590
        *flags |= BDRV_O_NOCACHE;
591
    } else if (!strcmp(mode, "writeback")) {
592
        *flags |= BDRV_O_CACHE_WB;
593
    } else if (!strcmp(mode, "unsafe")) {
594
        *flags |= BDRV_O_CACHE_WB;
595
        *flags |= BDRV_O_NO_FLUSH;
596
    } else if (!strcmp(mode, "writethrough")) {
597
        /* this is the default */
598
    } else {
599
        return -1;
600
    }
601

    
602
    return 0;
603
}
604

    
605
/**
606
 * The copy-on-read flag is actually a reference count so multiple users may
607
 * use the feature without worrying about clobbering its previous state.
608
 * Copy-on-read stays enabled until all users have called to disable it.
609
 */
610
void bdrv_enable_copy_on_read(BlockDriverState *bs)
611
{
612
    bs->copy_on_read++;
613
}
614

    
615
/* Drop one copy-on-read reference; asserts that at least one is held. */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read -= 1;
}
620

    
621
/*
622
 * Common part for opening disk images and files
623
 */
624
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
625
    int flags, BlockDriver *drv)
626
{
627
    int ret, open_flags;
628

    
629
    assert(drv != NULL);
630
    assert(bs->file == NULL);
631

    
632
    trace_bdrv_open_common(bs, filename, flags, drv->format_name);
633

    
634
    bs->open_flags = flags;
635
    bs->buffer_alignment = 512;
636

    
637
    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
638
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
639
        bdrv_enable_copy_on_read(bs);
640
    }
641

    
642
    pstrcpy(bs->filename, sizeof(bs->filename), filename);
643

    
644
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
645
        return -ENOTSUP;
646
    }
647

    
648
    bs->drv = drv;
649
    bs->opaque = g_malloc0(drv->instance_size);
650

    
651
    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
652
    open_flags = flags | BDRV_O_CACHE_WB;
653

    
654
    /*
655
     * Clear flags that are internal to the block layer before opening the
656
     * image.
657
     */
658
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
659

    
660
    /*
661
     * Snapshots should be writable.
662
     */
663
    if (bs->is_temporary) {
664
        open_flags |= BDRV_O_RDWR;
665
    }
666

    
667
    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
668

    
669
    /* Open the image, either directly or using a protocol */
670
    if (drv->bdrv_file_open) {
671
        ret = drv->bdrv_file_open(bs, filename, open_flags);
672
    } else {
673
        ret = bdrv_file_open(&bs->file, filename, open_flags);
674
        if (ret >= 0) {
675
            ret = drv->bdrv_open(bs, open_flags);
676
        }
677
    }
678

    
679
    if (ret < 0) {
680
        goto free_and_fail;
681
    }
682

    
683
    ret = refresh_total_sectors(bs, bs->total_sectors);
684
    if (ret < 0) {
685
        goto free_and_fail;
686
    }
687

    
688
#ifndef _WIN32
689
    if (bs->is_temporary) {
690
        unlink(filename);
691
    }
692
#endif
693
    return 0;
694

    
695
free_and_fail:
696
    if (bs->file) {
697
        bdrv_delete(bs->file);
698
        bs->file = NULL;
699
    }
700
    g_free(bs->opaque);
701
    bs->opaque = NULL;
702
    bs->drv = NULL;
703
    return ret;
704
}
705

    
706
/*
707
 * Opens a file using a protocol (file, host_device, nbd, ...)
708
 */
709
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
710
{
711
    BlockDriverState *bs;
712
    BlockDriver *drv;
713
    int ret;
714

    
715
    drv = bdrv_find_protocol(filename);
716
    if (!drv) {
717
        return -ENOENT;
718
    }
719

    
720
    bs = bdrv_new("");
721
    ret = bdrv_open_common(bs, filename, flags, drv);
722
    if (ret < 0) {
723
        bdrv_delete(bs);
724
        return ret;
725
    }
726
    bs->growable = 1;
727
    *pbs = bs;
728
    return 0;
729
}
730

    
731
/*
732
 * Opens a disk image (raw, qcow2, vmdk, ...)
733
 */
734
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
735
              BlockDriver *drv)
736
{
737
    int ret;
738
    char tmp_filename[PATH_MAX];
739

    
740
    if (flags & BDRV_O_SNAPSHOT) {
741
        BlockDriverState *bs1;
742
        int64_t total_size;
743
        int is_protocol = 0;
744
        BlockDriver *bdrv_qcow2;
745
        QEMUOptionParameter *options;
746
        char backing_filename[PATH_MAX];
747

    
748
        /* if snapshot, we create a temporary backing file and open it
749
           instead of opening 'filename' directly */
750

    
751
        /* if there is a backing file, use it */
752
        bs1 = bdrv_new("");
753
        ret = bdrv_open(bs1, filename, 0, drv);
754
        if (ret < 0) {
755
            bdrv_delete(bs1);
756
            return ret;
757
        }
758
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
759

    
760
        if (bs1->drv && bs1->drv->protocol_name)
761
            is_protocol = 1;
762

    
763
        bdrv_delete(bs1);
764

    
765
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
766
        if (ret < 0) {
767
            return ret;
768
        }
769

    
770
        /* Real path is meaningless for protocols */
771
        if (is_protocol)
772
            snprintf(backing_filename, sizeof(backing_filename),
773
                     "%s", filename);
774
        else if (!realpath(filename, backing_filename))
775
            return -errno;
776

    
777
        bdrv_qcow2 = bdrv_find_format("qcow2");
778
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
779

    
780
        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
781
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
782
        if (drv) {
783
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
784
                drv->format_name);
785
        }
786

    
787
        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
788
        free_option_parameters(options);
789
        if (ret < 0) {
790
            return ret;
791
        }
792

    
793
        filename = tmp_filename;
794
        drv = bdrv_qcow2;
795
        bs->is_temporary = 1;
796
    }
797

    
798
    /* Find the right image format driver */
799
    if (!drv) {
800
        ret = find_image_format(filename, &drv);
801
    }
802

    
803
    if (!drv) {
804
        goto unlink_and_fail;
805
    }
806

    
807
    /* Open the image */
808
    ret = bdrv_open_common(bs, filename, flags, drv);
809
    if (ret < 0) {
810
        goto unlink_and_fail;
811
    }
812

    
813
    /* If there is a backing file, use it */
814
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
815
        char backing_filename[PATH_MAX];
816
        int back_flags;
817
        BlockDriver *back_drv = NULL;
818

    
819
        bs->backing_hd = bdrv_new("");
820
        bdrv_get_full_backing_filename(bs, backing_filename,
821
                                       sizeof(backing_filename));
822

    
823
        if (bs->backing_format[0] != '\0') {
824
            back_drv = bdrv_find_format(bs->backing_format);
825
        }
826

    
827
        /* backing files always opened read-only */
828
        back_flags =
829
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
830

    
831
        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
832
        if (ret < 0) {
833
            bdrv_close(bs);
834
            return ret;
835
        }
836
        if (bs->is_temporary) {
837
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
838
        } else {
839
            /* base image inherits from "parent" */
840
            bs->backing_hd->keep_read_only = bs->keep_read_only;
841
        }
842
    }
843

    
844
    if (!bdrv_key_required(bs)) {
845
        bdrv_dev_change_media_cb(bs, true);
846
    }
847

    
848
    /* throttling disk I/O limits */
849
    if (bs->io_limits_enabled) {
850
        bdrv_io_limits_enable(bs);
851
    }
852

    
853
    return 0;
854

    
855
unlink_and_fail:
856
    if (bs->is_temporary) {
857
        unlink(filename);
858
    }
859
    return ret;
860
}
861

    
862
/* Close the image attached to bs, if any: flush, cancel the block job, wait
 * for pending I/O, delete the backing and protocol layers and reset the
 * per-image fields.  I/O throttling is disabled afterwards if it was
 * enabled.  bs itself is not freed. */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);

    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs_snapshots == bs) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }

        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif

        /* forget everything that described the closed image */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->total_sectors = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
909

    
910
void bdrv_close_all(void)
911
{
912
    BlockDriverState *bs;
913

    
914
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
915
        bdrv_close(bs);
916
    }
917
}
918

    
919
/*
920
 * Wait for pending requests to complete across all BlockDriverStates
921
 *
922
 * This function does not flush data to disk, use bdrv_flush_all() for that
923
 * after calling this function.
924
 *
925
 * Note that completion of an asynchronous I/O operation can trigger any
926
 * number of other I/O operations on other devices---for example a coroutine
927
 * can be arbitrarily complex and a constant flow of I/O can come until the
928
 * coroutine is complete.  Because of this, it is not possible to have a
929
 * function to drain a single device's I/O queue.
930
 */
931
void bdrv_drain_all(void)
932
{
933
    BlockDriverState *bs;
934
    bool busy;
935

    
936
    do {
937
        busy = qemu_aio_wait();
938

    
939
        /* FIXME: We do not have timer support here, so this is effectively
940
         * a busy wait.
941
         */
942
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
943
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
944
                qemu_co_queue_restart_all(&bs->throttled_reqs);
945
                busy = true;
946
            }
947
        }
948
    } while (busy);
949

    
950
    /* If requests are still pending there is a bug somewhere */
951
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
952
        assert(QLIST_EMPTY(&bs->tracked_requests));
953
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
954
    }
955
}
956

    
957
/* make a BlockDriverState anonymous by removing from bdrv_state list.
958
   Also, NULL terminate the device_name to prevent double remove */
959
void bdrv_make_anon(BlockDriverState *bs)
960
{
961
    if (bs->device_name[0] != '\0') {
962
        QTAILQ_REMOVE(&bdrv_states, bs, list);
963
    }
964
    bs->device_name[0] = '\0';
965
}
966

    
967
/* Invoke the driver's optional bdrv_rebind callback; does nothing when bs
 * has no driver or the driver does not provide the callback. */
static void bdrv_rebind(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_rebind) {
        return;
    }
    drv->bdrv_rebind(bs);
}
973

    
974
/*
975
 * Add new bs contents at the top of an image chain while the chain is
976
 * live, while keeping required fields on the top layer.
977
 *
978
 * This will modify the BlockDriverState fields, and swap contents
979
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
980
 *
981
 * bs_new is required to be anonymous.
982
 *
983
 * This function does not create any image files.
984
 */
985
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
986
{
987
    BlockDriverState tmp;
988

    
989
    /* bs_new must be anonymous */
990
    assert(bs_new->device_name[0] == '\0');
991

    
992
    tmp = *bs_new;
993

    
994
    /* there are some fields that need to stay on the top layer: */
995
    tmp.open_flags        = bs_top->open_flags;
996

    
997
    /* dev info */
998
    tmp.dev_ops           = bs_top->dev_ops;
999
    tmp.dev_opaque        = bs_top->dev_opaque;
1000
    tmp.dev               = bs_top->dev;
1001
    tmp.buffer_alignment  = bs_top->buffer_alignment;
1002
    tmp.copy_on_read      = bs_top->copy_on_read;
1003

    
1004
    tmp.enable_write_cache = bs_top->enable_write_cache;
1005

    
1006
    /* i/o timing parameters */
1007
    tmp.slice_time        = bs_top->slice_time;
1008
    tmp.slice_start       = bs_top->slice_start;
1009
    tmp.slice_end         = bs_top->slice_end;
1010
    tmp.io_limits         = bs_top->io_limits;
1011
    tmp.io_base           = bs_top->io_base;
1012
    tmp.throttled_reqs    = bs_top->throttled_reqs;
1013
    tmp.block_timer       = bs_top->block_timer;
1014
    tmp.io_limits_enabled = bs_top->io_limits_enabled;
1015

    
1016
    /* geometry */
1017
    tmp.cyls              = bs_top->cyls;
1018
    tmp.heads             = bs_top->heads;
1019
    tmp.secs              = bs_top->secs;
1020
    tmp.translation       = bs_top->translation;
1021

    
1022
    /* r/w error */
1023
    tmp.on_read_error     = bs_top->on_read_error;
1024
    tmp.on_write_error    = bs_top->on_write_error;
1025

    
1026
    /* i/o status */
1027
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
1028
    tmp.iostatus          = bs_top->iostatus;
1029

    
1030
    /* keep the same entry in bdrv_states */
1031
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
1032
    tmp.list = bs_top->list;
1033

    
1034
    /* The contents of 'tmp' will become bs_top, as we are
1035
     * swapping bs_new and bs_top contents. */
1036
    tmp.backing_hd = bs_new;
1037
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
1038
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
1039

    
1040
    /* swap contents of the fixed new bs and the current top */
1041
    *bs_new = *bs_top;
1042
    *bs_top = tmp;
1043

    
1044
    /* device_name[] was carried over from the old bs_top.  bs_new
1045
     * shouldn't be in bdrv_states, so we need to make device_name[]
1046
     * reflect the anonymity of bs_new
1047
     */
1048
    bs_new->device_name[0] = '\0';
1049

    
1050
    /* clear the copied fields in the new backing file */
1051
    bdrv_detach_dev(bs_new, bs_new->dev);
1052

    
1053
    qemu_co_queue_init(&bs_new->throttled_reqs);
1054
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
1055
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1056
    bdrv_iostatus_disable(bs_new);
1057

    
1058
    /* we don't use bdrv_io_limits_disable() for this, because we don't want
1059
     * to affect or delete the block_timer, as it has been moved to bs_top */
1060
    bs_new->io_limits_enabled = false;
1061
    bs_new->block_timer       = NULL;
1062
    bs_new->slice_time        = 0;
1063
    bs_new->slice_start       = 0;
1064
    bs_new->slice_end         = 0;
1065

    
1066
    bdrv_rebind(bs_new);
1067
    bdrv_rebind(bs_top);
1068
}
1069

    
1070
/*
 * Close bs and free it.
 *
 * The caller must have detached any device first and bs must have no job
 * and must not be marked in use; these preconditions are asserted.
 */
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* Drop the bdrv_states entry if bs is still named. */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    /* After closing, bs is expected not to be the snapshot state any more. */
    assert(bs != bs_snapshots);
    g_free(bs);
}
1084

    
1085
/*
 * Attach dev to bs and reset the I/O status.
 *
 * Returns 0 on success, -EBUSY if a device is already attached.
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (!bs->dev) {
        bs->dev = dev;
        bdrv_iostatus_reset(bs);
        return 0;
    }
    return -EBUSY;
}
1095

    
1096
/* TODO qdevified devices don't use this, remove when devices are qdevified */
1097
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1098
{
1099
    if (bdrv_attach_dev(bs, dev) < 0) {
1100
        abort();
1101
    }
1102
}
1103

    
1104
/*
 * Detach dev from bs.  dev must be the currently attached device (asserted).
 * The device callbacks and their opaque pointer are cleared and the buffer
 * alignment is reset to 512.
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);

    bs->dev_opaque = NULL;
    bs->dev_ops = NULL;
    bs->dev = NULL;
    bs->buffer_alignment = 512;
}
1113

    
1114
/* TODO change to return DeviceState * when all users are qdevified */
1115
void *bdrv_get_attached_dev(BlockDriverState *bs)
1116
{
1117
    return bs->dev;
1118
}
1119

    
1120
/*
 * Install the device callback table and its opaque argument on bs.
 *
 * If bs is the current snapshot state and now reports removable media,
 * bs_snapshots is reset to NULL.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;

    if (bs != bs_snapshots) {
        return;
    }
    if (bdrv_dev_has_removable_media(bs)) {
        bs_snapshots = NULL;
    }
}
1129

    
1130
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1131
                               BlockQMPEventAction action, int is_read)
1132
{
1133
    QObject *data;
1134
    const char *action_str;
1135

    
1136
    switch (action) {
1137
    case BDRV_ACTION_REPORT:
1138
        action_str = "report";
1139
        break;
1140
    case BDRV_ACTION_IGNORE:
1141
        action_str = "ignore";
1142
        break;
1143
    case BDRV_ACTION_STOP:
1144
        action_str = "stop";
1145
        break;
1146
    default:
1147
        abort();
1148
    }
1149

    
1150
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1151
                              bdrv->device_name,
1152
                              action_str,
1153
                              is_read ? "read" : "write");
1154
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1155

    
1156
    qobject_decref(data);
1157
}
1158

    
1159
/*
 * Emit a QEVENT_DEVICE_TRAY_MOVED monitor event for bs, with 'tray-open'
 * set to ejected.
 */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    const char *name = bdrv_get_device_name(bs);
    QObject *event;

    event = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                               name, ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, event);

    qobject_decref(event);
}
1169

    
1170
/*
 * Notify the attached device of a media change and emit the matching
 * tray events.  Does nothing if the device has no change_media_cb.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    bool was_closed;

    if (!bs->dev_ops || !bs->dev_ops->change_media_cb) {
        return;
    }

    /* Sample the tray state before the device callback runs. */
    was_closed = !bdrv_dev_is_tray_open(bs);
    bs->dev_ops->change_media_cb(bs->dev_opaque, load);

    if (was_closed) {
        /* tray open */
        bdrv_emit_qmp_eject_event(bs, true);
    }
    if (load) {
        /* tray close */
        bdrv_emit_qmp_eject_event(bs, false);
    }
}
1185

    
1186
/*
 * Return true if no device is attached, or if the attached device
 * provides a change_media_cb callback.
 */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    if (!bs->dev) {
        return true;
    }
    return bs->dev_ops && bs->dev_ops->change_media_cb;
}
1190

    
1191
/*
 * Forward an eject request to the attached device, if it provides an
 * eject_request_cb callback; otherwise do nothing.
 */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->eject_request_cb) {
        return;
    }
    ops->eject_request_cb(bs->dev_opaque, force);
}
1197

    
1198
/*
 * Ask the attached device whether its tray is open.
 * Returns false when the device provides no is_tray_open callback.
 */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->is_tray_open) {
        return false;
    }
    return ops->is_tray_open(bs->dev_opaque);
}
1205

    
1206
/*
 * Invoke the attached device's resize_cb callback, if it provides one.
 */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->resize_cb) {
        return;
    }
    ops->resize_cb(bs->dev_opaque);
}
1212

    
1213
/*
 * Ask the attached device whether its medium is locked.
 * Returns false when the device provides no is_medium_locked callback.
 */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    const BlockDevOps *ops = bs->dev_ops;

    if (!ops || !ops->is_medium_locked) {
        return false;
    }
    return ops->is_medium_locked(bs->dev_opaque);
}
1220

    
1221
/*
1222
 * Run consistency checks on an image
1223
 *
1224
 * Returns 0 if the check could be completed (it doesn't mean that the image is
1225
 * free of errors) or -errno when an internal error occurred. The results of the
1226
 * check are stored in res.
1227
 */
1228
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1229
{
1230
    if (bs->drv->bdrv_check == NULL) {
1231
        return -ENOTSUP;
1232
    }
1233

    
1234
    memset(res, 0, sizeof(*res));
1235
    return bs->drv->bdrv_check(bs, res, fix);
1236
}
1237

    
1238
#define COMMIT_BUF_SECTORS 2048
1239

    
1240
/* commit COW file into the raw image */
1241
int bdrv_commit(BlockDriverState *bs)
1242
{
1243
    BlockDriver *drv = bs->drv;
1244
    BlockDriver *backing_drv;
1245
    int64_t sector, total_sectors;
1246
    int n, ro, open_flags;
1247
    int ret = 0, rw_ret = 0;
1248
    uint8_t *buf;
1249
    char filename[1024];
1250
    BlockDriverState *bs_rw, *bs_ro;
1251

    
1252
    if (!drv)
1253
        return -ENOMEDIUM;
1254
    
1255
    if (!bs->backing_hd) {
1256
        return -ENOTSUP;
1257
    }
1258

    
1259
    if (bs->backing_hd->keep_read_only) {
1260
        return -EACCES;
1261
    }
1262

    
1263
    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1264
        return -EBUSY;
1265
    }
1266

    
1267
    backing_drv = bs->backing_hd->drv;
1268
    ro = bs->backing_hd->read_only;
1269
    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1270
    open_flags =  bs->backing_hd->open_flags;
1271

    
1272
    if (ro) {
1273
        /* re-open as RW */
1274
        bdrv_delete(bs->backing_hd);
1275
        bs->backing_hd = NULL;
1276
        bs_rw = bdrv_new("");
1277
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1278
            backing_drv);
1279
        if (rw_ret < 0) {
1280
            bdrv_delete(bs_rw);
1281
            /* try to re-open read-only */
1282
            bs_ro = bdrv_new("");
1283
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1284
                backing_drv);
1285
            if (ret < 0) {
1286
                bdrv_delete(bs_ro);
1287
                /* drive not functional anymore */
1288
                bs->drv = NULL;
1289
                return ret;
1290
            }
1291
            bs->backing_hd = bs_ro;
1292
            return rw_ret;
1293
        }
1294
        bs->backing_hd = bs_rw;
1295
    }
1296

    
1297
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1298
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1299

    
1300
    for (sector = 0; sector < total_sectors; sector += n) {
1301
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1302

    
1303
            if (bdrv_read(bs, sector, buf, n) != 0) {
1304
                ret = -EIO;
1305
                goto ro_cleanup;
1306
            }
1307

    
1308
            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1309
                ret = -EIO;
1310
                goto ro_cleanup;
1311
            }
1312
        }
1313
    }
1314

    
1315
    if (drv->bdrv_make_empty) {
1316
        ret = drv->bdrv_make_empty(bs);
1317
        bdrv_flush(bs);
1318
    }
1319

    
1320
    /*
1321
     * Make sure all data we wrote to the backing device is actually
1322
     * stable on disk.
1323
     */
1324
    if (bs->backing_hd)
1325
        bdrv_flush(bs->backing_hd);
1326

    
1327
ro_cleanup:
1328
    g_free(buf);
1329

    
1330
    if (ro) {
1331
        /* re-open as RO */
1332
        bdrv_delete(bs->backing_hd);
1333
        bs->backing_hd = NULL;
1334
        bs_ro = bdrv_new("");
1335
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1336
            backing_drv);
1337
        if (ret < 0) {
1338
            bdrv_delete(bs_ro);
1339
            /* drive not functional anymore */
1340
            bs->drv = NULL;
1341
            return ret;
1342
        }
1343
        bs->backing_hd = bs_ro;
1344
        bs->backing_hd->keep_read_only = 0;
1345
    }
1346

    
1347
    return ret;
1348
}
1349

    
1350
int bdrv_commit_all(void)
1351
{
1352
    BlockDriverState *bs;
1353

    
1354
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1355
        int ret = bdrv_commit(bs);
1356
        if (ret < 0) {
1357
            return ret;
1358
        }
1359
    }
1360
    return 0;
1361
}
1362

    
1363
struct BdrvTrackedRequest {
1364
    BlockDriverState *bs;
1365
    int64_t sector_num;
1366
    int nb_sectors;
1367
    bool is_write;
1368
    QLIST_ENTRY(BdrvTrackedRequest) list;
1369
    Coroutine *co; /* owner, used for deadlock detection */
1370
    CoQueue wait_queue; /* coroutines blocked on this request */
1371
};
1372

    
1373
/**
1374
 * Remove an active request from the tracked requests list
1375
 *
1376
 * This function should be called when a tracked request is completing.
1377
 */
1378
static void tracked_request_end(BdrvTrackedRequest *req)
1379
{
1380
    QLIST_REMOVE(req, list);
1381
    qemu_co_queue_restart_all(&req->wait_queue);
1382
}
1383

    
1384
/**
1385
 * Add an active request to the tracked requests list
1386
 */
1387
static void tracked_request_begin(BdrvTrackedRequest *req,
1388
                                  BlockDriverState *bs,
1389
                                  int64_t sector_num,
1390
                                  int nb_sectors, bool is_write)
1391
{
1392
    *req = (BdrvTrackedRequest){
1393
        .bs = bs,
1394
        .sector_num = sector_num,
1395
        .nb_sectors = nb_sectors,
1396
        .is_write = is_write,
1397
        .co = qemu_coroutine_self(),
1398
    };
1399

    
1400
    qemu_co_queue_init(&req->wait_queue);
1401

    
1402
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1403
}
1404

    
1405
/**
1406
 * Round a region to cluster boundaries
1407
 */
1408
static void round_to_clusters(BlockDriverState *bs,
1409
                              int64_t sector_num, int nb_sectors,
1410
                              int64_t *cluster_sector_num,
1411
                              int *cluster_nb_sectors)
1412
{
1413
    BlockDriverInfo bdi;
1414

    
1415
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1416
        *cluster_sector_num = sector_num;
1417
        *cluster_nb_sectors = nb_sectors;
1418
    } else {
1419
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1420
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1421
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1422
                                            nb_sectors, c);
1423
    }
1424
}
1425

    
1426
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1427
                                     int64_t sector_num, int nb_sectors) {
1428
    /*        aaaa   bbbb */
1429
    if (sector_num >= req->sector_num + req->nb_sectors) {
1430
        return false;
1431
    }
1432
    /* bbbb   aaaa        */
1433
    if (req->sector_num >= sector_num + nb_sectors) {
1434
        return false;
1435
    }
1436
    return true;
1437
}
1438

    
1439
/*
 * Wait until no tracked request on bs overlaps the cluster-aligned region
 * containing [sector_num, sector_num + nb_sectors).
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool waited;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        waited = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (!tracked_request_overlaps(req, cluster_sector_num,
                                          cluster_nb_sectors)) {
                continue;
            }

            /* Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests.  This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            /* Wait on this request, then rescan the list from the start. */
            qemu_co_queue_wait(&req->wait_queue);
            waited = true;
            break;
        }
    } while (waited);
}
1474

    
1475
/*
1476
 * Return values:
1477
 * 0        - success
1478
 * -EINVAL  - backing format specified, but no file
1479
 * -ENOSPC  - can't update the backing file because no space is left in the
1480
 *            image file header
1481
 * -ENOTSUP - format driver doesn't support changing the backing file
1482
 */
1483
int bdrv_change_backing_file(BlockDriverState *bs,
1484
    const char *backing_file, const char *backing_fmt)
1485
{
1486
    BlockDriver *drv = bs->drv;
1487
    int ret;
1488

    
1489
    /* Backing file format doesn't make sense without a backing file */
1490
    if (backing_fmt && !backing_file) {
1491
        return -EINVAL;
1492
    }
1493

    
1494
    if (drv->bdrv_change_backing_file != NULL) {
1495
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1496
    } else {
1497
        ret = -ENOTSUP;
1498
    }
1499

    
1500
    if (ret == 0) {
1501
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1502
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1503
    }
1504
    return ret;
1505
}
1506

    
1507
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1508
                                   size_t size)
1509
{
1510
    int64_t len;
1511

    
1512
    if (!bdrv_is_inserted(bs))
1513
        return -ENOMEDIUM;
1514

    
1515
    if (bs->growable)
1516
        return 0;
1517

    
1518
    len = bdrv_getlength(bs);
1519

    
1520
    if (offset < 0)
1521
        return -EIO;
1522

    
1523
    if ((offset > len) || (len - offset < size))
1524
        return -EIO;
1525

    
1526
    return 0;
1527
}
1528

    
1529
/*
 * Sector-granularity wrapper around bdrv_check_byte_request(): converts
 * the sector range to bytes and returns that function's result.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    int64_t offset = sector_num * BDRV_SECTOR_SIZE;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;

    return bdrv_check_byte_request(bs, offset, size);
}
1535

    
1536
typedef struct RwCo {
1537
    BlockDriverState *bs;
1538
    int64_t sector_num;
1539
    int nb_sectors;
1540
    QEMUIOVector *qiov;
1541
    bool is_write;
1542
    int ret;
1543
} RwCo;
1544

    
1545
/*
 * Coroutine entry point for bdrv_rw_co(): performs the read or write
 * described by the RwCo passed in opaque and stores the result in ->ret.
 */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (rwco->is_write) {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    }
}
1557

    
1558
/*
1559
 * Process a synchronous request using coroutines
1560
 */
1561
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1562
                      int nb_sectors, bool is_write)
1563
{
1564
    QEMUIOVector qiov;
1565
    struct iovec iov = {
1566
        .iov_base = (void *)buf,
1567
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1568
    };
1569
    Coroutine *co;
1570
    RwCo rwco = {
1571
        .bs = bs,
1572
        .sector_num = sector_num,
1573
        .nb_sectors = nb_sectors,
1574
        .qiov = &qiov,
1575
        .is_write = is_write,
1576
        .ret = NOT_DONE,
1577
    };
1578

    
1579
    qemu_iovec_init_external(&qiov, &iov, 1);
1580

    
1581
    /**
1582
     * In sync call context, when the vcpu is blocked, this throttling timer
1583
     * will not fire; so the I/O throttling function has to be disabled here
1584
     * if it has been enabled.
1585
     */
1586
    if (bs->io_limits_enabled) {
1587
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
1588
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
1589
        bdrv_io_limits_disable(bs);
1590
    }
1591

    
1592
    if (qemu_in_coroutine()) {
1593
        /* Fast-path if already in coroutine context */
1594
        bdrv_rw_co_entry(&rwco);
1595
    } else {
1596
        co = qemu_coroutine_create(bdrv_rw_co_entry);
1597
        qemu_coroutine_enter(co, &rwco);
1598
        while (rwco.ret == NOT_DONE) {
1599
            qemu_aio_wait();
1600
        }
1601
    }
1602
    return rwco.ret;
1603
}
1604

    
1605
/* return < 0 if error. See bdrv_write() for the return codes */
1606
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1607
              uint8_t *buf, int nb_sectors)
1608
{
1609
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1610
}
1611

    
1612
#define BITS_PER_LONG  (sizeof(unsigned long) * 8)
1613

    
1614
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1615
                             int nb_sectors, int dirty)
1616
{
1617
    int64_t start, end;
1618
    unsigned long val, idx, bit;
1619

    
1620
    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1621
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1622

    
1623
    for (; start <= end; start++) {
1624
        idx = start / BITS_PER_LONG;
1625
        bit = start % BITS_PER_LONG;
1626
        val = bs->dirty_bitmap[idx];
1627
        if (dirty) {
1628
            if (!(val & (1UL << bit))) {
1629
                bs->dirty_count++;
1630
                val |= 1UL << bit;
1631
            }
1632
        } else {
1633
            if (val & (1UL << bit)) {
1634
                bs->dirty_count--;
1635
                val &= ~(1UL << bit);
1636
            }
1637
        }
1638
        bs->dirty_bitmap[idx] = val;
1639
    }
1640
}
1641

    
1642
/* Return < 0 if error. Important errors are:
1643
  -EIO         generic I/O error (may happen for all errors)
1644
  -ENOMEDIUM   No media inserted.
1645
  -EINVAL      Invalid sector number or nb_sectors
1646
  -EACCES      Trying to write a read-only device
1647
*/
1648
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1649
               const uint8_t *buf, int nb_sectors)
1650
{
1651
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1652
}
1653

    
1654
/*
 * Read count1 bytes starting at byte offset into buf, using a bounce
 * sector for the unaligned head and tail of the range.
 *
 * Returns count1 on success or the negative error from bdrv_read().
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int chunk, whole_sectors;
    int ret;

    /* first read to align to sector start */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(buf, sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), chunk);
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        buf += chunk;
    }

    /* read the sectors "in place" */
    whole_sectors = remaining >> BDRV_SECTOR_BITS;
    if (whole_sectors > 0) {
        ret = bdrv_read(bs, sector_num, buf, whole_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += whole_sectors;
        chunk = whole_sectors << BDRV_SECTOR_BITS;
        buf += chunk;
        remaining -= chunk;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(buf, sector_buf, remaining);
    }
    return count1;
}
1698

    
1699
/*
 * Write count1 bytes from buf starting at byte offset, using
 * read-modify-write of a bounce sector for the unaligned head and tail.
 *
 * Returns count1 on success or the negative error from bdrv_read() or
 * bdrv_write().
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t sector_buf[BDRV_SECTOR_SIZE];
    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    int remaining = count1;
    int chunk, whole_sectors;
    int ret;

    /* first write to align to sector start */
    chunk = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (chunk > remaining) {
        chunk = remaining;
    }
    if (chunk > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(sector_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, chunk);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        remaining -= chunk;
        if (remaining == 0) {
            return count1;
        }
        sector_num++;
        buf += chunk;
    }

    /* write the sectors "in place" */
    whole_sectors = remaining >> BDRV_SECTOR_BITS;
    if (whole_sectors > 0) {
        ret = bdrv_write(bs, sector_num, buf, whole_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += whole_sectors;
        chunk = whole_sectors << BDRV_SECTOR_BITS;
        buf += chunk;
        remaining -= chunk;
    }

    /* add data from the last sector */
    if (remaining > 0) {
        ret = bdrv_read(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(sector_buf, buf, remaining);
        ret = bdrv_write(bs, sector_num, sector_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }
    return count1;
}
1747

    
1748
/*
1749
 * Writes to the file and ensures that no writes are reordered across this
1750
 * request (acts as a barrier)
1751
 *
1752
 * Returns 0 on success, -errno in error cases.
1753
 */
1754
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1755
    const void *buf, int count)
1756
{
1757
    int ret;
1758

    
1759
    ret = bdrv_pwrite(bs, offset, buf, count);
1760
    if (ret < 0) {
1761
        return ret;
1762
    }
1763

    
1764
    /* No flush needed for cache modes that already do it */
1765
    if (bs->enable_write_cache) {
1766
        bdrv_flush(bs);
1767
    }
1768

    
1769
    return 0;
1770
}
1771

    
1772
/*
 * Copy-on-read: read the whole cluster range surrounding the request,
 * write it back to bs, then copy the requested part into qiov.
 *
 * Returns the negative error from the read or the write-back, otherwise
 * the (non-negative) result of the write-back.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec bounce_iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    bounce_iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    bounce_buffer = qemu_blockalign(bs, bounce_iov.iov_len);
    bounce_iov.iov_base = bounce_buffer;
    qemu_iovec_init_external(&bounce_qiov, &bounce_iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret >= 0) {
        if (drv->bdrv_co_write_zeroes &&
            buffer_is_zero(bounce_buffer, bounce_iov.iov_len)) {
            ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                          cluster_nb_sectors);
        } else {
            /* This does not change the data on the disk, it is not necessary
             * to flush even in cache=writethrough mode.
             */
            ret = drv->bdrv_co_writev(bs, cluster_sector_num,
                                      cluster_nb_sectors, &bounce_qiov);
        }
    }

    /* It might be okay to ignore write errors for guest requests.  If this
     * is a deliberate copy-on-read then we don't want to ignore the error.
     * Simply report it in all cases.
     */
    if (ret >= 0) {
        /* Hand back only the part of the cluster range that was asked for. */
        skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
        qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                               nb_sectors * BDRV_SECTOR_SIZE);
    }

    qemu_vfree(bounce_buffer);
    return ret;
}
1837

    
1838
/*
1839
 * Handle a read request in coroutine context
1840
 */
1841
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1842
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1843
    BdrvRequestFlags flags)
1844
{
1845
    BlockDriver *drv = bs->drv;
1846
    BdrvTrackedRequest req;
1847
    int ret;
1848

    
1849
    if (!drv) {
1850
        return -ENOMEDIUM;
1851
    }
1852
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1853
        return -EIO;
1854
    }
1855

    
1856
    /* throttling disk read I/O */
1857
    if (bs->io_limits_enabled) {
1858
        bdrv_io_limits_intercept(bs, false, nb_sectors);
1859
    }
1860

    
1861
    if (bs->copy_on_read) {
1862
        flags |= BDRV_REQ_COPY_ON_READ;
1863
    }
1864
    if (flags & BDRV_REQ_COPY_ON_READ) {
1865
        bs->copy_on_read_in_flight++;
1866
    }
1867

    
1868
    if (bs->copy_on_read_in_flight) {
1869
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1870
    }
1871

    
1872
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1873

    
1874
    if (flags & BDRV_REQ_COPY_ON_READ) {
1875
        int pnum;
1876

    
1877
        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1878
        if (ret < 0) {
1879
            goto out;
1880
        }
1881

    
1882
        if (!ret || pnum != nb_sectors) {
1883
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1884
            goto out;
1885
        }
1886
    }
1887

    
1888
    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1889

    
1890
out:
1891
    tracked_request_end(&req);
1892

    
1893
    if (flags & BDRV_REQ_COPY_ON_READ) {
1894
        bs->copy_on_read_in_flight--;
1895
    }
1896

    
1897
    return ret;
1898
}
1899

    
1900
/*
 * Coroutine read entry point: trace the request and run it through
 * bdrv_co_do_readv() with no request flags set.
 */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
1907

    
1908
/*
 * Coroutine read entry point that requests copy-on-read: trace the request
 * and run it through bdrv_co_do_readv() with BDRV_REQ_COPY_ON_READ set.
 */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                           BDRV_REQ_COPY_ON_READ);
    return ret;
}
1916

    
1917
/*
 * Write nb_sectors sectors of zeroes starting at sector_num.
 *
 * The driver's bdrv_co_write_zeroes hook is tried first, if present.  Any
 * result other than -ENOTSUP is returned as-is.  Otherwise a zero-filled
 * bounce buffer is written through the driver's bdrv_co_writev hook and the
 * result of that write is returned.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector zero_qiov;
    struct iovec zero_iov;
    int status;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* Fast path: the driver may know how to zero a range efficiently */
    if (drv->bdrv_co_write_zeroes) {
        status = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (status != -ENOTSUP) {
            return status;
        }
    }

    /* Slow path: allocate an aligned, zero-filled buffer and write it */
    zero_iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    zero_iov.iov_base = qemu_blockalign(bs, zero_iov.iov_len);
    memset(zero_iov.iov_base, 0, zero_iov.iov_len);
    qemu_iovec_init_external(&zero_qiov, &zero_iov, 1);

    status = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &zero_qiov);

    /* The bounce buffer is released on both success and failure */
    qemu_vfree(zero_iov.iov_base);
    return status;
}
1947

    
1948
/*
1949
 * Handle a write request in coroutine context
1950
 */
1951
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1952
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1953
    BdrvRequestFlags flags)
1954
{
1955
    BlockDriver *drv = bs->drv;
1956
    BdrvTrackedRequest req;
1957
    int ret;
1958

    
1959
    if (!bs->drv) {
1960
        return -ENOMEDIUM;
1961
    }
1962
    if (bs->read_only) {
1963
        return -EACCES;
1964
    }
1965
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1966
        return -EIO;
1967
    }
1968

    
1969
    /* throttling disk write I/O */
1970
    if (bs->io_limits_enabled) {
1971
        bdrv_io_limits_intercept(bs, true, nb_sectors);
1972
    }
1973

    
1974
    if (bs->copy_on_read_in_flight) {
1975
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1976
    }
1977

    
1978
    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1979

    
1980
    if (flags & BDRV_REQ_ZERO_WRITE) {
1981
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1982
    } else {
1983
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1984
    }
1985

    
1986
    if (ret == 0 && !bs->enable_write_cache) {
1987
        ret = bdrv_co_flush(bs);
1988
    }
1989

    
1990
    if (bs->dirty_bitmap) {
1991
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1992
    }
1993

    
1994
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1995
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
1996
    }
1997

    
1998
    tracked_request_end(&req);
1999

    
2000
    return ret;
2001
}
2002

    
2003
/*
 * Coroutine write entry point.
 *
 * Emits the bdrv_co_writev trace event and forwards the request unchanged to
 * bdrv_co_do_writev() with no request flags set.
 */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    int ret;

    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
    return ret;
}
2010

    
2011
/*
 * Coroutine entry point for zeroing a sector range.
 *
 * Emits the bdrv_co_write_zeroes trace event and issues the request through
 * bdrv_co_do_writev() with BDRV_REQ_ZERO_WRITE set and no I/O vector.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    int ret;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                            BDRV_REQ_ZERO_WRITE);
    return ret;
}
2019

    
2020
/**
2021
 * Truncate file to 'offset' bytes (needed only for file protocols)
2022
 */
2023
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2024
{
2025
    BlockDriver *drv = bs->drv;
2026
    int ret;
2027
    if (!drv)
2028
        return -ENOMEDIUM;
2029
    if (!drv->bdrv_truncate)
2030
        return -ENOTSUP;
2031
    if (bs->read_only)
2032
        return -EACCES;
2033
    if (bdrv_in_use(bs))
2034
        return -EBUSY;
2035
    ret = drv->bdrv_truncate(bs, offset);
2036
    if (ret == 0) {
2037
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2038
        bdrv_dev_resize_cb(bs);
2039
    }
2040
    return ret;
2041
}
2042

    
2043
/**
2044
 * Length of a allocated file in bytes. Sparse files are counted by actual
2045
 * allocated space. Return < 0 if error or unknown.
2046
 */
2047
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2048
{
2049
    BlockDriver *drv = bs->drv;
2050
    if (!drv) {
2051
        return -ENOMEDIUM;
2052
    }
2053
    if (drv->bdrv_get_allocated_file_size) {
2054
        return drv->bdrv_get_allocated_file_size(bs);
2055
    }
2056
    if (bs->file) {
2057
        return bdrv_get_allocated_file_size(bs->file);
2058
    }
2059
    return -ENOTSUP;
2060
}
2061

    
2062
/**
2063
 * Length of a file in bytes. Return < 0 if error or unknown.
2064
 */
2065
int64_t bdrv_getlength(BlockDriverState *bs)
2066
{
2067
    BlockDriver *drv = bs->drv;
2068
    if (!drv)
2069
        return -ENOMEDIUM;
2070

    
2071
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2072
        if (drv->bdrv_getlength) {
2073
            return drv->bdrv_getlength(bs);
2074
        }
2075
    }
2076
    return bs->total_sectors * BDRV_SECTOR_SIZE;
2077
}
2078

    
2079
/* return 0 as number of sectors if no device present or error */
2080
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2081
{
2082
    int64_t length;
2083
    length = bdrv_getlength(bs);
2084
    if (length < 0)
2085
        length = 0;
2086
    else
2087
        length = length >> BDRV_SECTOR_BITS;
2088
    *nb_sectors_ptr = length;
2089
}
2090

    
2091
/* One packed partition table entry, as read from the boot sector */
struct partition {
    uint8_t boot_ind;       /* 0x80 - active */
    uint8_t head;           /* starting head */
    uint8_t sector;         /* starting sector */
    uint8_t cyl;            /* starting cylinder */
    uint8_t sys_ind;        /* partition type */
    uint8_t end_head;       /* end head */
    uint8_t end_sector;     /* end sector */
    uint8_t end_cyl;        /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* number of sectors in partition */
} QEMU_PACKED;
2103

    
2104
/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
2105
static int guess_disk_lchs(BlockDriverState *bs,
2106
                           int *pcylinders, int *pheads, int *psectors)
2107
{
2108
    uint8_t buf[BDRV_SECTOR_SIZE];
2109
    int ret, i, heads, sectors, cylinders;
2110
    struct partition *p;
2111
    uint32_t nr_sects;
2112
    uint64_t nb_sectors;
2113
    bool enabled;
2114

    
2115
    bdrv_get_geometry(bs, &nb_sectors);
2116

    
2117
    /**
2118
     * The function will be invoked during startup not only in sync I/O mode,
2119
     * but also in async I/O mode. So the I/O throttling function has to
2120
     * be disabled temporarily here, not permanently.
2121
     */
2122
    enabled = bs->io_limits_enabled;
2123
    bs->io_limits_enabled = false;
2124
    ret = bdrv_read(bs, 0, buf, 1);
2125
    bs->io_limits_enabled = enabled;
2126
    if (ret < 0)
2127
        return -1;
2128
    /* test msdos magic */
2129
    if (buf[510] != 0x55 || buf[511] != 0xaa)
2130
        return -1;
2131
    for(i = 0; i < 4; i++) {
2132
        p = ((struct partition *)(buf + 0x1be)) + i;
2133
        nr_sects = le32_to_cpu(p->nr_sects);
2134
        if (nr_sects && p->end_head) {
2135
            /* We make the assumption that the partition terminates on
2136
               a cylinder boundary */
2137
            heads = p->end_head + 1;
2138
            sectors = p->end_sector & 63;
2139
            if (sectors == 0)
2140
                continue;
2141
            cylinders = nb_sectors / (heads * sectors);
2142
            if (cylinders < 1 || cylinders > 16383)
2143
                continue;
2144
            *pheads = heads;
2145
            *psectors = sectors;
2146
            *pcylinders = cylinders;
2147
#if 0
2148
            printf("guessed geometry: LCHS=%d %d %d\n",
2149
                   cylinders, heads, sectors);
2150
#endif
2151
            return 0;
2152
        }
2153
    }
2154
    return -1;
2155
}
2156

    
2157
/*
 * Determine a cylinders/heads/sectors geometry for the device.
 *
 * Order of preference:
 *  1. An existing geometry hint (non-zero cylinders) is returned as-is and
 *     nothing else is modified.
 *  2. A geometry guessed from the partition table with at most 16 heads is
 *     used directly; an AUTO translation hint is then switched to NONE to
 *     stay in sync with the logical geometry.
 *  3. Otherwise a default 16-head, 63-sector geometry is derived from the
 *     device size, with cylinders clamped to 2..16383.  If the partition
 *     table showed more than 16 heads (a BIOS LBA translation was active)
 *     and the translation hint is AUTO, the hint is set to LARGE or LBA
 *     depending on cylinders * heads.
 *
 * In cases 2 and 3 the result is also stored as the new geometry hint.
 */
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;
    uint64_t default_cyls;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
        return;
    }

    if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
        if (heads <= 16) {
            *pcyls = cylinders;
            *pheads = heads;
            *psecs = secs;
            /* disable any translation to be in sync with
               the logical geometry */
            if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                bdrv_set_translation_hint(bs,
                                          BIOS_ATA_TRANSLATION_NONE);
            }
            bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
            return;
        }
        /* if heads > 16, it means that a BIOS LBA
           translation was active, so the default
           hardware geometry is OK */
        lba_detected = 1;
    }

    /*
     * If no geometry, use a standard physical disk geometry.  The clamp is
     * done on the 64-bit quotient so that a very large image cannot be
     * narrowed to an arbitrary int before the limits are applied.
     */
    default_cyls = nb_sectors / (16 * 63);
    if (default_cyls > 16383) {
        default_cyls = 16383;
    } else if (default_cyls < 2) {
        default_cyls = 2;
    }
    *pcyls = (int)default_cyls;
    *pheads = 16;
    *psecs = 63;

    if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
        if ((*pcyls * *pheads) <= 131072) {
            bdrv_set_translation_hint(bs,
                                      BIOS_ATA_TRANSLATION_LARGE);
        } else {
            bdrv_set_translation_hint(bs,
                                      BIOS_ATA_TRANSLATION_LBA);
        }
    }

    bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
}
2215

    
2216
/* Record a cylinders/heads/sectors geometry hint on the device state. */
void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->secs = secs;
    bs->heads = heads;
    bs->cyls = cyls;
}
2223

    
2224
/* Record the translation hint on the device state. */
void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}
2228

    
2229
/* Copy the stored cylinders/heads/sectors hint into the output pointers. */
void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *psecs = bs->secs;
    *pheads = bs->heads;
    *pcyls = bs->cyls;
}
2236

    
2237
/* throttling disk io limits */
2238
void bdrv_set_io_limits(BlockDriverState *bs,
2239
                        BlockIOLimit *io_limits)
2240
{
2241
    bs->io_limits = *io_limits;
2242
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2243
}
2244

    
2245
/* Recognize floppy formats */
2246
typedef struct FDFormat {
2247
    FDriveType drive;
2248
    uint8_t last_sect;
2249
    uint8_t max_track;
2250
    uint8_t max_head;
2251
    FDriveRate rate;
2252
} FDFormat;
2253

    
2254
static const FDFormat fd_formats[] = {
2255
    /* First entry is default format */
2256
    /* 1.44 MB 3"1/2 floppy disks */
2257
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2258
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2259
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2260
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2261
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2262
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2263
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2264
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2265
    /* 2.88 MB 3"1/2 floppy disks */
2266
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2267
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2268
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2269
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2270
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2271
    /* 720 kB 3"1/2 floppy disks */
2272
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
2273
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2274
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2275
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2276
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2277
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2278
    /* 1.2 MB 5"1/4 floppy disks */
2279
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2280
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2281
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2282
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2283
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2284
    /* 720 kB 5"1/4 floppy disks */
2285
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
2286
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2287
    /* 360 kB 5"1/4 floppy disks */
2288
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
2289
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
2290
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2291
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2292
    /* 320 kB 5"1/4 floppy disks */
2293
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
2294
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
2295
    /* 360 kB must match 5"1/4 better than 3"1/2... */
2296
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
2297
    /* end */
2298
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2299
};
2300

    
2301
void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2302
                                   int *max_track, int *last_sect,
2303
                                   FDriveType drive_in, FDriveType *drive,
2304
                                   FDriveRate *rate)
2305
{
2306
    const FDFormat *parse;
2307
    uint64_t nb_sectors, size;
2308
    int i, first_match, match;
2309

    
2310
    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2311
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2312
        /* User defined disk */
2313
        *rate = FDRIVE_RATE_500K;
2314
    } else {
2315
        bdrv_get_geometry(bs, &nb_sectors);
2316
        match = -1;
2317
        first_match = -1;
2318
        for (i = 0; ; i++) {
2319
            parse = &fd_formats[i];
2320
            if (parse->drive == FDRIVE_DRV_NONE) {
2321
                break;
2322
            }
2323
            if (drive_in == parse->drive ||
2324
                drive_in == FDRIVE_DRV_NONE) {
2325
                size = (parse->max_head + 1) * parse->max_track *
2326
                    parse->last_sect;
2327
                if (nb_sectors == size) {
2328
                    match = i;
2329
                    break;
2330
                }
2331
                if (first_match == -1) {
2332
                    first_match = i;
2333
                }
2334
            }
2335
        }
2336
        if (match == -1) {
2337
            if (first_match == -1) {
2338
                match = 1;
2339
            } else {
2340
                match = first_match;
2341
            }
2342
            parse = &fd_formats[match];
2343
        }
2344
        *nb_heads = parse->max_head + 1;
2345
        *max_track = parse->max_track;
2346
        *last_sect = parse->last_sect;
2347
        *drive = parse->drive;
2348
        *rate = parse->rate;
2349
    }
2350
}
2351

    
2352
/* Return the translation hint stored on the device state. */
int bdrv_get_translation_hint(BlockDriverState *bs)
{
    int translation = bs->translation;

    return translation;
}
2356

    
2357
/* Store the actions to take on read errors and on write errors. */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_write_error = on_write_error;
    bs->on_read_error = on_read_error;
}
2363

    
2364
/*
 * Return the stored error action: the read action when is_read is non-zero,
 * the write action otherwise.
 */
BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    if (is_read) {
        return bs->on_read_error;
    }
    return bs->on_write_error;
}
2368

    
2369
/* Return the device's read_only flag. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    int read_only = bs->read_only;

    return read_only;
}
2373

    
2374
/* Return the device's sg flag. */
int bdrv_is_sg(BlockDriverState *bs)
{
    int sg = bs->sg;

    return sg;
}
2378

    
2379
/* Return the device's enable_write_cache setting. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    int wce = bs->enable_write_cache;

    return wce;
}
2383

    
2384
/* Set the device's enable_write_cache setting to 'wce'. */
void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;
}
2388

    
2389
/*
 * Return 1 when the backing image is encrypted; otherwise return the
 * device's own 'encrypted' flag.
 */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted) {
        return 1;
    }
    return bs->encrypted;
}
2395

    
2396
/*
 * Return non-zero when a key still has to be supplied: either the backing
 * image or the device itself is encrypted and has no valid key yet.
 */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing = bs->backing_hd;

    if (backing && backing->encrypted && !backing->valid_key) {
        return 1;
    }

    return (bs->encrypted && !bs->valid_key);
}
2404

    
2405
/*
 * Supply an encryption key for the device.
 *
 * When the backing image is encrypted the key is applied to it first; a
 * failure there is returned immediately, and if the device itself is not
 * encrypted the call ends with 0.
 *
 * For the device itself: -EINVAL if it is not encrypted, -ENOMEDIUM if there
 * is no driver or the driver has no bdrv_set_key hook.  Otherwise the
 * driver's result is returned; valid_key is cleared on failure and set on
 * success, and the first time the key becomes valid the media change
 * callback is invoked.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;

    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }

    if (!bs->encrypted) {
        return -EINVAL;
    }
    if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }

    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
        return ret;
    }

    if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2430

    
2431
/*
 * Copy the driver's format name into buf (capacity buf_size).
 * Without a driver, buf is set to the empty string.
 */
void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (bs->drv) {
        pstrcpy(buf, buf_size, bs->drv->format_name);
        return;
    }
    buf[0] = '\0';
}
2439

    
2440
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2441
                         void *opaque)
2442
{
2443
    BlockDriver *drv;
2444

    
2445
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
2446
        it(opaque, drv->format_name);
2447
    }
2448
}
2449

    
2450
BlockDriverState *bdrv_find(const char *name)
2451
{
2452
    BlockDriverState *bs;
2453

    
2454
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2455
        if (!strcmp(name, bs->device_name)) {
2456
            return bs;
2457
        }
2458
    }
2459
    return NULL;
2460
}
2461

    
2462
/*
 * Step through the bdrv_states list: a NULL argument yields the first
 * element, any other argument yields the element following it.
 */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    return bs ? QTAILQ_NEXT(bs, list) : QTAILQ_FIRST(&bdrv_states);
}
2469

    
2470
/*
 * Invoke it(opaque, bs) once for every device on the bdrv_states list, in
 * list order.
 */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *cur;

    QTAILQ_FOREACH(cur, &bdrv_states, list) {
        it(opaque, cur);
    }
}
2478

    
2479
/* Return the device name stored in the state (not a copy). */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    const char *name = bs->device_name;

    return name;
}
2483

    
2484
/* Return the device's open_flags value. */
int bdrv_get_flags(BlockDriverState *bs)
{
    int flags = bs->open_flags;

    return flags;
}
2488

    
2489
void bdrv_flush_all(void)
2490
{
2491
    BlockDriverState *bs;
2492

    
2493
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2494
        bdrv_flush(bs);
2495
    }
2496
}
2497

    
2498
/*
 * Ask the driver's bdrv_has_zero_init hook; without a hook the answer is 1.
 * The device must have a driver attached (asserted).
 */
int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (!bs->drv->bdrv_has_zero_init) {
        return 1;
    }

    return bs->drv->bdrv_has_zero_init(bs);
}
2508

    
2509
typedef struct BdrvCoIsAllocatedData {
2510
    BlockDriverState *bs;
2511
    int64_t sector_num;
2512
    int nb_sectors;
2513
    int *pnum;
2514
    int ret;
2515
    bool done;
2516
} BdrvCoIsAllocatedData;
2517

    
2518
/*
2519
 * Returns true iff the specified sector is present in the disk image. Drivers
2520
 * not implementing the functionality are assumed to not support backing files,
2521
 * hence all their sectors are reported as allocated.
2522
 *
2523
 * If 'sector_num' is beyond the end of the disk image the return value is 0
2524
 * and 'pnum' is set to 0.
2525
 *
2526
 * 'pnum' is set to the number of sectors (including and immediately following
2527
 * the specified sector) that are known to be in the same
2528
 * allocated/unallocated state.
2529
 *
2530
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2531
 * beyond the end of the disk image it will be clamped.
2532
 */
2533
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2534
                                      int nb_sectors, int *pnum)
2535
{
2536
    int64_t n;
2537

    
2538
    if (sector_num >= bs->total_sectors) {
2539
        *pnum = 0;
2540
        return 0;
2541
    }
2542

    
2543
    n = bs->total_sectors - sector_num;
2544
    if (n < nb_sectors) {
2545
        nb_sectors = n;
2546
    }
2547

    
2548
    if (!bs->drv->bdrv_co_is_allocated) {
2549
        *pnum = nb_sectors;
2550
        return 1;
2551
    }
2552

    
2553
    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2554
}
2555

    
2556
/* Coroutine wrapper for bdrv_is_allocated() */
2557
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2558
{
2559
    BdrvCoIsAllocatedData *data = opaque;
2560
    BlockDriverState *bs = data->bs;
2561

    
2562
    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2563
                                     data->pnum);
2564
    data->done = true;
2565
}
2566

    
2567
/*
2568
 * Synchronous wrapper around bdrv_co_is_allocated().
2569
 *
2570
 * See bdrv_co_is_allocated() for details.
2571
 */
2572
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2573
                      int *pnum)
2574
{
2575
    Coroutine *co;
2576
    BdrvCoIsAllocatedData data = {
2577
        .bs = bs,
2578
        .sector_num = sector_num,
2579
        .nb_sectors = nb_sectors,
2580
        .pnum = pnum,
2581
        .done = false,
2582
    };
2583

    
2584
    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2585
    qemu_coroutine_enter(co, &data);
2586
    while (!data.done) {
2587
        qemu_aio_wait();
2588
    }
2589
    return data.ret;
2590
}
2591

    
2592
/*
2593
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2594
 *
2595
 * Return true if the given sector is allocated in any image between
2596
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
2597
 * sector is allocated in any image of the chain.  Return false otherwise.
2598
 *
2599
 * 'pnum' is set to the number of sectors (including and immediately following
2600
 *  the specified sector) that are known to be in the same
2601
 *  allocated/unallocated state.
2602
 *
2603
 */
2604
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2605
                                            BlockDriverState *base,
2606
                                            int64_t sector_num,
2607
                                            int nb_sectors, int *pnum)
2608
{
2609
    BlockDriverState *intermediate;
2610
    int ret, n = nb_sectors;
2611

    
2612
    intermediate = top;
2613
    while (intermediate && intermediate != base) {
2614
        int pnum_inter;
2615
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2616
                                   &pnum_inter);
2617
        if (ret < 0) {
2618
            return ret;
2619
        } else if (ret) {
2620
            *pnum = pnum_inter;
2621
            return 1;
2622
        }
2623

    
2624
        /*
2625
         * [sector_num, nb_sectors] is unallocated on top but intermediate
2626
         * might have
2627
         *
2628
         * [sector_num+x, nr_sectors] allocated.
2629
         */
2630
        if (n > pnum_inter) {
2631
            n = pnum_inter;
2632
        }
2633

    
2634
        intermediate = intermediate->backing_hd;
2635
    }
2636

    
2637
    *pnum = n;
2638
    return 0;
2639
}
2640

    
2641
/*
 * Build a newly allocated list with one BlockInfoList entry per device on
 * the bdrv_states list, in list order.
 *
 * Every entry carries the device name, the type string "unknown", and the
 * locked/removable state.  Tray state is added for removable media, I/O
 * status when it is enabled, and the 'inserted' details (file, read-only
 * flag, format, encryption, backing file, I/O limits) when a driver is
 * attached.  'errp' is not used by this function.
 */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL;
    BlockInfoList **tail = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;

            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file =
                    g_strdup(bs->backing_file);
            }

            /* Limits stay at their zeroed defaults unless throttling is on */
            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        *tail = info;
        tail = &info->next;
    }

    return head;
}
2704

    
2705
/* Consider exposing this as a full fledged QMP command */
2706
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2707
{
2708
    BlockStats *s;
2709

    
2710
    s = g_malloc0(sizeof(*s));
2711

    
2712
    if (bs->device_name[0]) {
2713
        s->has_device = true;
2714
        s->device = g_strdup(bs->device_name);
2715
    }
2716

    
2717
    s->stats = g_malloc0(sizeof(*s->stats));
2718
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2719
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2720
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2721
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2722
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2723
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2724
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2725
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2726
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2727

    
2728
    if (bs->file) {
2729
        s->has_parent = true;
2730
        s->parent = qmp_query_blockstat(bs->file, NULL);
2731
    }
2732

    
2733
    return s;
2734
}
2735

    
2736
/*
 * Build a newly allocated list holding the statistics of every device on the
 * bdrv_states list, in list order.  'errp' is not used by this function.
 */
BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL;
    BlockStatsList **tail = &head;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *entry = g_malloc0(sizeof(*entry));

        entry->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        *tail = entry;
        tail = &entry->next;
    }

    return head;
}
2756

    
2757
/*
 * Return the name of the encrypted image: the backing file name when the
 * backing image is encrypted, else the device's own file name when the
 * device is encrypted, else NULL.
 */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    }
    if (bs->encrypted) {
        return bs->filename;
    }
    return NULL;
}
2766

    
2767
/* Copy the backing file name into 'filename' (capacity filename_size). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    const char *backing = bs->backing_file;

    pstrcpy(filename, filename_size, backing);
}
2772

    
2773
/*
 * Write nb_sectors sectors from buf through the driver's compressed-write
 * hook.
 *
 * Returns -ENOMEDIUM without a driver, -ENOTSUP when the driver has no such
 * hook and -EIO when bdrv_check_request() rejects the range; otherwise the
 * hook's result.  The range is marked in the dirty bitmap, if one exists,
 * before the write is issued.
 */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
2790

    
2791
/*
 * Fill *bdi through the driver's bdrv_get_info hook.
 *
 * Returns -ENOMEDIUM without a driver and -ENOTSUP without the hook; in both
 * cases *bdi is left untouched.  Otherwise *bdi is zeroed before the hook is
 * called and the hook's result is returned.
 */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }

    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
2801

    
2802
/*
 * Save 'size' bytes of VM state from buf at position pos.
 *
 * The driver's own hook is preferred; without one the request is forwarded
 * to bs->file.  Returns -ENOMEDIUM without a driver and -ENOTSUP when neither
 * a hook nor an underlying file is available.
 */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    }
    if (!bs->file) {
        return -ENOTSUP;
    }
    return bdrv_save_vmstate(bs->file, buf, pos, size);
}
2814

    
2815
/*
 * Load 'size' bytes of VM state from position pos into buf.
 *
 * The driver's own hook is preferred; without one the request is forwarded
 * to bs->file.  Returns -ENOMEDIUM without a driver and -ENOTSUP when neither
 * a hook nor an underlying file is available.
 */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_load_vmstate) {
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    }
    if (!bs->file) {
        return -ENOTSUP;
    }
    return bdrv_load_vmstate(bs->file, buf, pos, size);
}
2827

    
2828
/*
 * Forward the debug event @event to the driver of @bs.
 *
 * Does nothing when @bs has no driver or the driver has no
 * bdrv_debug_event callback.
 */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    /*
     * Plain call: ISO C does not allow "return expr;" in a function
     * returning void, even when expr itself has type void.
     */
    drv->bdrv_debug_event(bs, event);
}
2839

    
2840
/**************************************************************/
2841
/* handling of snapshots */
2842

    
2843
/*
 * Return 1 if a snapshot can be taken on @bs, 0 otherwise.
 *
 * A node qualifies when it has a driver, is inserted, is not read-only and
 * its driver provides bdrv_snapshot_create. If only the callback is missing,
 * the same test is applied to bs->file, and so on down the chain.
 */
int bdrv_can_snapshot(BlockDriverState *bs)
{
    for (;;) {
        BlockDriver *driver = bs->drv;

        if (!driver || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
            return 0;
        }
        if (driver->bdrv_snapshot_create) {
            return 1;
        }
        if (bs->file == NULL) {
            return 0;
        }
        bs = bs->file;
    }
}
2859

    
2860
/* Return 1 if @bs has BDRV_O_SNAPSHOT set in its open flags, 0 otherwise. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return (bs->open_flags & BDRV_O_SNAPSHOT) != 0;
}
2864

    
2865
BlockDriverState *bdrv_snapshots(void)
2866
{
2867
    BlockDriverState *bs;
2868

    
2869
    if (bs_snapshots) {
2870
        return bs_snapshots;
2871
    }
2872

    
2873
    bs = NULL;
2874
    while ((bs = bdrv_next(bs))) {
2875
        if (bdrv_can_snapshot(bs)) {
2876
            bs_snapshots = bs;
2877
            return bs;
2878
        }
2879
    }
2880
    return NULL;
2881
}
2882

    
2883
int bdrv_snapshot_create(BlockDriverState *bs,
2884
                         QEMUSnapshotInfo *sn_info)
2885
{
2886
    BlockDriver *drv = bs->drv;
2887
    if (!drv)
2888
        return -ENOMEDIUM;
2889
    if (drv->bdrv_snapshot_create)
2890
        return drv->bdrv_snapshot_create(bs, sn_info);
2891
    if (bs->file)
2892
        return bdrv_snapshot_create(bs->file, sn_info);
2893
    return -ENOTSUP;
2894
}
2895

    
2896
/*
 * Revert @bs to the snapshot identified by @snapshot_id.
 *
 * If the driver provides bdrv_snapshot_goto, it is used directly. Otherwise,
 * when @bs has an underlying bs->file, the driver is closed, the snapshot is
 * applied to bs->file, and the driver is reopened with the original open
 * flags.
 *
 * Returns -ENOMEDIUM when @bs has no driver, -ENOTSUP when neither the driver
 * nor an underlying file can do the job, the driver's bdrv_open error if the
 * reopen fails, and otherwise the result of the snapshot operation.
 */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_snapshot_goto) {
        return drv->bdrv_snapshot_goto(bs, snapshot_id);
    }

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            /*
             * The format layer could not be reopened: drop the underlying
             * file and leave @bs without a driver. Clear bs->file as well so
             * nothing can later reach the deleted state through it.
             */
            bdrv_delete(bs->file);
            bs->file = NULL;
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}
2921

    
2922
/*
 * Delete the snapshot identified by @snapshot_id.
 *
 * Walks from @bs down the bs->file chain and uses the first driver that
 * provides bdrv_snapshot_delete. Returns -ENOMEDIUM when a node on the way
 * has no driver and -ENOTSUP when the chain ends without a capable driver.
 */
int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    for (;;) {
        BlockDriver *driver = bs->drv;

        if (!driver) {
            return -ENOMEDIUM;
        }
        if (driver->bdrv_snapshot_delete) {
            return driver->bdrv_snapshot_delete(bs, snapshot_id);
        }
        if (!bs->file) {
            return -ENOTSUP;
        }
        bs = bs->file;
    }
}
2933

    
2934
int bdrv_snapshot_list(BlockDriverState *bs,
2935
                       QEMUSnapshotInfo **psn_info)
2936
{
2937
    BlockDriver *drv = bs->drv;
2938
    if (!drv)
2939
        return -ENOMEDIUM;
2940
    if (drv->bdrv_snapshot_list)
2941
        return drv->bdrv_snapshot_list(bs, psn_info);
2942
    if (bs->file)
2943
        return bdrv_snapshot_list(bs->file, psn_info);
2944
    return -ENOTSUP;
2945
}
2946

    
2947
/*
 * Load the snapshot named @snapshot_name through the driver's
 * bdrv_snapshot_load_tmp callback.
 *
 * Returns -ENOMEDIUM when @bs has no driver, -EINVAL when @bs is not
 * read-only, -ENOTSUP when the driver lacks the callback, and the callback's
 * return value otherwise.
 */
int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *driver = bs->drv;

    if (!driver) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (!driver->bdrv_snapshot_load_tmp) {
        return -ENOTSUP;
    }

    return driver->bdrv_snapshot_load_tmp(bs, snapshot_name);
}
2962

    
2963
/*
 * Search the backing chain of @bs for the image whose recorded backing file
 * name equals @backing_file (compared with strcmp()).
 *
 * Returns the matching backing BlockDriverState. Returns NULL when the walk
 * reaches a node without a driver or the end of the chain.
 */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    while (bs->drv && bs->backing_hd) {
        if (strcmp(bs->backing_file, backing_file) == 0) {
            return bs->backing_hd;
        }
        bs = bs->backing_hd;
    }

    return NULL;
}
2980

    
2981
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (capacity @buf_size) in a short human-readable
 * form and return @buf.
 *
 * Values up to 999 are printed as plain integers. Larger values are scaled
 * by powers of 1024 and tagged K, M, G or T: one decimal is shown while the
 * scaled value is below 10, otherwise it is rounded to an integer. Anything
 * too large for the other ranges is expressed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < 10 * unit) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            return buf;
        }
        /* The last suffix takes everything that did not fit earlier. */
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            return buf;
        }
    }

    return buf;
}
3010

    
3011
/*
 * Format one line of the snapshot table into @buf (capacity @buf_size) and
 * return @buf.
 *
 * With @sn == NULL the column headings are written. Otherwise the line
 * holds the snapshot's id, name, VM state size (via
 * get_human_readable_size()), date in local time and VM clock as
 * hh:mm:ss.mmm.
 */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char size_buf[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
        return buf;
    }

    /* Creation date, broken down in the local time zone. */
    ti = sn->date_sec;
#ifdef _WIN32
    ptm = localtime(&ti);
    strftime(date_buf, sizeof(date_buf),
             "%Y-%m-%d %H:%M:%S", ptm);
#else
    localtime_r(&ti, &tm);
    strftime(date_buf, sizeof(date_buf),
             "%Y-%m-%d %H:%M:%S", &tm);
#endif

    /* VM clock: whole seconds split into h/m/s, plus the millisecond part. */
    secs = sn->vm_clock_nsec / 1000000000;
    snprintf(clock_buf, sizeof(clock_buf),
             "%02d:%02d:%02d.%03d",
             (int)(secs / 3600),
             (int)((secs / 60) % 60),
             (int)(secs % 60),
             (int)((sn->vm_clock_nsec / 1000000) % 1000));

    get_human_readable_size(size_buf, sizeof(size_buf), sn->vm_state_size);
    snprintf(buf, buf_size,
             "%-10s%-20s%7s%20s%15s",
             sn->id_str, sn->name,
             size_buf,
             date_buf,
             clock_buf);
    return buf;
}
3053

    
3054
/**************************************************************/
3055
/* async I/Os */
3056

    
3057
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3058
                                 QEMUIOVector *qiov, int nb_sectors,
3059
                                 BlockDriverCompletionFunc *cb, void *opaque)
3060
{
3061
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3062

    
3063
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3064
                                 cb, opaque, false);
3065
}
3066

    
3067
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3068
                                  QEMUIOVector *qiov, int nb_sectors,
3069
                                  BlockDriverCompletionFunc *cb, void *opaque)
3070
{
3071
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3072

    
3073
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3074
                                 cb, opaque, true);
3075
}
3076

    
3077

    
3078
typedef struct MultiwriteCB {
3079
    int error;
3080
    int num_requests;
3081
    int num_callbacks;
3082
    struct {
3083
        BlockDriverCompletionFunc *cb;
3084
        void *opaque;
3085
        QEMUIOVector *free_qiov;
3086
    } callbacks[];
3087
} MultiwriteCB;
3088

    
3089
/*
 * Invoke every caller-supplied completion callback recorded in @mcb with the
 * shared error code, then release the merged qiov attached to that entry,
 * if any.
 */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int idx;

    for (idx = 0; idx < mcb->num_callbacks; idx++) {
        QEMUIOVector *merged;

        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);

        merged = mcb->callbacks[idx].free_qiov;
        if (merged) {
            qemu_iovec_destroy(merged);
        }
        g_free(merged);
    }
}
3101

    
3102
static void multiwrite_cb(void *opaque, int ret)
3103
{
3104
    MultiwriteCB *mcb = opaque;
3105

    
3106
    trace_multiwrite_cb(mcb, ret);
3107

    
3108
    if (ret < 0 && !mcb->error) {
3109
        mcb->error = ret;
3110
    }
3111

    
3112
    mcb->num_requests--;
3113
    if (mcb->num_requests == 0) {
3114
        multiwrite_user_cb(mcb);
3115
        g_free(mcb);
3116
    }
3117
}
3118

    
3119
static int multiwrite_req_compare(const void *a, const void *b)
3120
{
3121
    const BlockRequest *req1 = a, *req2 = b;
3122

    
3123
    /*
3124
     * Note that we can't simply subtract req2->sector from req1->sector
3125
     * here as that could overflow the return value.
3126
     */
3127
    if (req1->sector > req2->sector) {
3128
        return 1;
3129
    } else if (req1->sector < req2->sector) {
3130
        return -1;
3131
    } else {
3132
        return 0;
3133
    }
3134
}
3135

    
3136
/*
3137
 * Takes a bunch of requests and tries to merge them. Returns the number of
3138
 * requests that remain after merging.
3139
 */
3140
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3141
    int num_reqs, MultiwriteCB *mcb)
3142
{
3143
    int i, outidx;
3144

    
3145
    // Sort requests by start sector
3146
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3147

    
3148
    // Check if adjacent requests touch the same clusters. If so, combine them,
3149
    // filling up gaps with zero sectors.
3150
    outidx = 0;
3151
    for (i = 1; i < num_reqs; i++) {
3152
        int merge = 0;
3153
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3154

    
3155
        // Handle exactly sequential writes and overlapping writes.
3156
        if (reqs[i].sector <= oldreq_last) {
3157
            merge = 1;
3158
        }
3159

    
3160
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3161
            merge = 0;
3162
        }
3163

    
3164
        if (merge) {
3165
            size_t size;
3166
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3167
            qemu_iovec_init(qiov,
3168
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3169

    
3170
            // Add the first request to the merged one. If the requests are
3171
            // overlapping, drop the last sectors of the first request.
3172
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
3173
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3174

    
3175
            // We should need to add any zeros between the two requests
3176
            assert (reqs[i].sector <= oldreq_last);
3177

    
3178
            // Add the second request
3179
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3180

    
3181
            reqs[outidx].nb_sectors = qiov->size >> 9;
3182
            reqs[outidx].qiov = qiov;
3183

    
3184
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3185
        } else {
3186
            outidx++;
3187
            reqs[outidx].sector     = reqs[i].sector;
3188
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3189
            reqs[outidx].qiov       = reqs[i].qiov;
3190
        }
3191
    }
3192

    
3193
    return outidx + 1;
3194
}
3195

    
3196
/*
3197
 * Submit multiple AIO write requests at once.
3198
 *
3199
 * On success, the function returns 0 and all requests in the reqs array have
3200
 * been submitted. In error case this function returns -1, and any of the
3201
 * requests may or may not be submitted yet. In particular, this means that the
3202
 * callback will be called for some of the requests, for others it won't. The
3203
 * caller must check the error field of the BlockRequest to wait for the right
3204
 * callbacks (if error != 0, no callback will be called).
3205
 *
3206
 * The implementation may modify the contents of the reqs array, e.g. to merge
3207
 * requests. However, the fields opaque and error are left unmodified as they
3208
 * are used to signal failure for a single request to the caller.
3209
 */
3210
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int idx;

    /* Without a medium nothing is submitted; flag every request instead. */
    if (bs->drv == NULL) {
        for (idx = 0; idx < num_reqs; idx++) {
            reqs[idx].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    /*
     * Allocate the shared completion state with one callback slot per
     * original request; the slots are filled before merging reorders reqs[].
     */
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (idx = 0; idx < num_reqs; idx++) {
        BlockRequest *req = &reqs[idx];

        mcb->callbacks[idx].cb = req->cb;
        mcb->callbacks[idx].opaque = req->opaque;
    }

    /* Merge what can be merged; num_reqs becomes the remaining count. */
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Submit the remaining requests; multiwrite_cb() counts them down. */
    mcb->num_requests = num_reqs;
    for (idx = 0; idx < num_reqs; idx++) {
        BlockRequest *req = &reqs[idx];

        bdrv_aio_writev(bs, req->sector, req->qiov,
            req->nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
3251

    
3252
/* Cancel @acb by calling the cancel hook of the AIOPool it belongs to. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    AIOPool *owner = acb->pool;

    owner->cancel(acb);
}
3256

    
3257
/* block I/O throttling */
3258
/*
 * Check whether a request of @nb_sectors sectors would exceed the
 * bytes-per-second limit that applies to @bs in the current slice.
 *
 * The total limit takes precedence over the per-direction limit chosen by
 * @is_write. Returns false, with *@wait set to 0, when no limit is
 * configured or the request still fits. Otherwise returns true, stores the
 * computed wait in *@wait, and extends the slice (bs->slice_time and
 * bs->slice_end). @wait may be NULL.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    bool     total_limited = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] != 0;
    uint64_t bps_limit;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    bps_limit = total_limited ? bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
                              : bs->io_limits.bps[is_write];
    if (bps_limit == 0) {
        /* No limit configured for this direction. */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (total_limited) {
        /* A total limit counts traffic in both directions. */
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: bytes already read/written since the slice's baseline,
     *             taken from the accounting counters.
     * bytes_res: bytes this request still needs to read/write.
     * (bytes_base + bytes_res) / bps_limit: the total time needed to
     *             complete all of that data at the limit.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* The limit is exceeded: extend bs->slice_end so the current accounting
     * is kept until the timer fires. The factors were tuned by experiment.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3317

    
3318
/*
 * Check whether one more request would exceed the operations-per-second
 * limit that applies to @bs in the current slice.
 *
 * The total limit takes precedence over the per-direction limit chosen by
 * @is_write. Returns false, with *@wait set to 0, when no limit is
 * configured or the request still fits. Otherwise returns true, stores the
 * computed wait in *@wait, and extends the slice (bs->slice_time and
 * bs->slice_end). @wait may be NULL.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    bool     total_limited = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] != 0;
    uint64_t iops_limit;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    iops_limit = total_limited ? bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]
                               : bs->io_limits.iops[is_write];
    if (iops_limit == 0) {
        /* No limit configured for this direction. */
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (total_limited) {
        /* A total limit counts operations in both directions. */
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    /* Calc approx time to dispatch, never going below zero */
    wait_time = (ios_base + 1) / iops_limit;
    wait_time = (wait_time > elapsed_time) ? wait_time - elapsed_time : 0;

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3369

    
3370
/*
 * Decide whether a request of @nb_sectors sectors must be throttled.
 *
 * Extends the current accounting slice, or starts a new one and re-bases
 * the byte/operation counters, then consults the bps and iops checks.
 * Returns true when either limit is exceeded, storing the larger of the two
 * waits in *@wait; returns false with *@wait set to 0 otherwise. @wait may
 * be NULL.
 */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    bool     over_bps, over_iops;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* Still inside the current slice: push its end forward. */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Start a fresh slice and take new baselines for both directions. */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    over_bps  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                       is_write, elapsed_time, &bps_wait);
    over_iops = bdrv_exceed_iops_limits(bs, is_write,
                                        elapsed_time, &iops_wait);
    if (!over_bps && !over_iops) {
        if (wait) {
            *wait = 0;
        }
        return false;
    }

    max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
    if (wait) {
        *wait = max_wait;
    }

    /* Make sure the slice lasts at least until the wait is over. */
    now = qemu_get_clock_ns(vm_clock);
    if (bs->slice_end < now + max_wait) {
        bs->slice_end = now + max_wait;
    }

    return true;
}
3421

    
3422
/**************************************************************/
3423
/* async block device emulation */
3424

    
3425
typedef struct BlockDriverAIOCBSync {
3426
    BlockDriverAIOCB common;
3427
    QEMUBH *bh;
3428
    int ret;
3429
    /* vector translation state */
3430
    QEMUIOVector *qiov;
3431
    uint8_t *bounce;
3432
    int is_write;
3433
} BlockDriverAIOCBSync;
3434

    
3435
/*
 * Cancel hook for bdrv_em_aio_pool.
 *
 * Deletes the pending bottom half so the completion callback never runs,
 * and returns the AIOCB to its pool. Because bdrv_aio_bh_cb() will no
 * longer run, the bounce buffer it would have freed is released here.
 */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;

    /* Allocated in bdrv_aio_rw_vector(); otherwise leaked on cancel. */
    qemu_vfree(acb->bounce);
    acb->bounce = NULL;

    qemu_aio_release(acb);
}
3443

    
3444
static AIOPool bdrv_em_aio_pool = {
3445
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3446
    .cancel             = bdrv_aio_cancel_em,
3447
};
3448

    
3449
static void bdrv_aio_bh_cb(void *opaque)
3450
{
3451
    BlockDriverAIOCBSync *acb = opaque;
3452

    
3453
    if (!acb->is_write)
3454
        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3455
    qemu_vfree(acb->bounce);
3456
    acb->common.cb(acb->common.opaque, acb->ret);
3457
    qemu_bh_delete(acb->bh);
3458
    acb->bh = NULL;
3459
    qemu_aio_release(acb);
3460
}
3461

    
3462
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3463
                                            int64_t sector_num,
3464
                                            QEMUIOVector *qiov,
3465
                                            int nb_sectors,
3466
                                            BlockDriverCompletionFunc *cb,
3467
                                            void *opaque,
3468
                                            int is_write)
3469

    
3470
{
3471
    BlockDriverAIOCBSync *acb;
3472

    
3473
    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3474
    acb->is_write = is_write;
3475
    acb->qiov = qiov;
3476
    acb->bounce = qemu_blockalign(bs, qiov->size);
3477
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3478

    
3479
    if (is_write) {
3480
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3481
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3482
    } else {
3483
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3484
    }
3485

    
3486
    qemu_bh_schedule(acb->bh);
3487

    
3488
    return &acb->common;
3489
}
3490

    
3491
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3492
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3493
        BlockDriverCompletionFunc *cb, void *opaque)
3494
{
3495
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3496
}
3497

    
3498
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3499
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3500
        BlockDriverCompletionFunc *cb, void *opaque)
3501
{
3502
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3503
}
3504

    
3505

    
3506
typedef struct BlockDriverAIOCBCoroutine {
3507
    BlockDriverAIOCB common;
3508
    BlockRequest req;
3509
    bool is_write;
3510
    QEMUBH* bh;
3511
} BlockDriverAIOCBCoroutine;
3512

    
3513
/*
 * Cancel hook for bdrv_em_co_aio_pool. @blockacb itself is not used; the
 * hook simply calls qemu_aio_flush().
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    (void)blockacb;
    qemu_aio_flush();
}
3517

    
3518
static AIOPool bdrv_em_co_aio_pool = {
3519
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3520
    .cancel             = bdrv_aio_co_cancel_em,
3521
};
3522

    
3523
static void bdrv_co_em_bh(void *opaque)
3524
{
3525
    BlockDriverAIOCBCoroutine *acb = opaque;
3526

    
3527
    acb->common.cb(acb->common.opaque, acb->req.error);
3528
    qemu_bh_delete(acb->bh);
3529
    qemu_aio_release(acb);
3530
}
3531

    
3532
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3533
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3534
{
3535
    BlockDriverAIOCBCoroutine *acb = opaque;
3536
    BlockDriverState *bs = acb->common.bs;
3537

    
3538
    if (!acb->is_write) {
3539
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3540
            acb->req.nb_sectors, acb->req.qiov, 0);
3541
    } else {
3542
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3543
            acb->req.nb_sectors, acb->req.qiov, 0);
3544
    }
3545

    
3546
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3547
    qemu_bh_schedule(acb->bh);
3548
}
3549

    
3550
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3551
                                               int64_t sector_num,
3552
                                               QEMUIOVector *qiov,
3553
                                               int nb_sectors,
3554
                                               BlockDriverCompletionFunc *cb,
3555
                                               void *opaque,
3556
                                               bool is_write)
3557
{
3558
    Coroutine *co;
3559
    BlockDriverAIOCBCoroutine *acb;
3560

    
3561
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3562
    acb->req.sector = sector_num;
3563
    acb->req.nb_sectors = nb_sectors;
3564
    acb->req.qiov = qiov;
3565
    acb->is_write = is_write;
3566

    
3567
    co = qemu_coroutine_create(bdrv_co_do_rw);
3568
    qemu_coroutine_enter(co, acb);
3569

    
3570
    return &acb->common;
3571
}
3572

    
3573
/*
 * Coroutine entry point for bdrv_aio_flush(): runs bdrv_co_flush(), stores
 * the result in req.error and schedules bdrv_co_em_bh() to report it.
 */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->req.error = bdrv_co_flush(acb->common.bs);

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3582

    
3583
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3584
        BlockDriverCompletionFunc *cb, void *opaque)
3585
{
3586
    trace_bdrv_aio_flush(bs, opaque);
3587

    
3588
    Coroutine *co;
3589
    BlockDriverAIOCBCoroutine *acb;
3590

    
3591
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3592
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3593
    qemu_coroutine_enter(co, acb);
3594

    
3595
    return &acb->common;
3596
}
3597

    
3598
/*
 * Coroutine entry point for bdrv_aio_discard(): runs bdrv_co_discard() on
 * the stored sector range, stores the result in req.error and schedules
 * bdrv_co_em_bh() to report it.
 */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->req.error = bdrv_co_discard(acb->common.bs, acb->req.sector,
                                     acb->req.nb_sectors);

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3607

    
3608
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3609
        int64_t sector_num, int nb_sectors,
3610
        BlockDriverCompletionFunc *cb, void *opaque)
3611
{
3612
    Coroutine *co;
3613
    BlockDriverAIOCBCoroutine *acb;
3614

    
3615
    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3616

    
3617
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3618
    acb->req.sector = sector_num;
3619
    acb->req.nb_sectors = nb_sectors;
3620
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3621
    qemu_coroutine_enter(co, acb);
3622

    
3623
    return &acb->common;
3624
}
3625

    
3626
void bdrv_init(void)
3627
{
3628
    module_call_init(MODULE_INIT_BLOCK);
3629
}
3630

    
3631
void bdrv_init_with_whitelist(void)
3632
{
3633
    use_bdrv_whitelist = 1;
3634
    bdrv_init();
3635
}
3636

    
3637
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3638
                   BlockDriverCompletionFunc *cb, void *opaque)
3639
{
3640
    BlockDriverAIOCB *acb;
3641

    
3642
    if (pool->free_aiocb) {
3643
        acb = pool->free_aiocb;
3644
        pool->free_aiocb = acb->next;
3645
    } else {
3646
        acb = g_malloc0(pool->aiocb_size);
3647
        acb->pool = pool;
3648
    }
3649
    acb->bs = bs;
3650
    acb->cb = cb;
3651
    acb->opaque = opaque;
3652
    return acb;
3653
}
3654

    
3655
void qemu_aio_release(void *p)
3656
{
3657
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3658
    AIOPool *pool = acb->pool;
3659
    acb->next = pool->free_aiocb;
3660
    pool->free_aiocb = acb;
3661
}
3662

    
3663
/**************************************************************/
3664
/* Coroutine block device emulation */
3665

    
3666
typedef struct CoroutineIOCompletion {
3667
    Coroutine *coroutine;
3668
    int ret;
3669
} CoroutineIOCompletion;
3670

    
3671
static void bdrv_co_io_em_complete(void *opaque, int ret)
3672
{
3673
    CoroutineIOCompletion *co = opaque;
3674

    
3675
    co->ret = ret;
3676
    qemu_coroutine_enter(co->coroutine, NULL);
3677
}
3678

    
3679
/*
 * Emulate coroutine I/O with the driver's bdrv_aio_readv/bdrv_aio_writev.
 *
 * Submits the AIO request, then yields until bdrv_co_io_em_complete()
 * re-enters the coroutine. Returns -EIO when the driver returns no AIOCB,
 * otherwise the result reported by the completion callback.
 */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (!is_write) {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &completion);
    } else {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &completion);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (acb == NULL) {
        return -EIO;
    }

    /* Sleep until the completion callback wakes this coroutine. */
    qemu_coroutine_yield();

    return completion.ret;
}
3704

    
3705
/* Read variant of bdrv_co_io_em() (is_write = false). */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = false;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
3711

    
3712
/* Write variant of bdrv_co_io_em() (is_write = true). */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    const bool is_write = true;

    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
}
3718

    
3719
/*
 * Coroutine entry point for bdrv_flush(): flushes rwco->bs and stores the
 * result in rwco->ret.
 */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;
    BlockDriverState *target = rwco->bs;

    rwco->ret = bdrv_co_flush(target);
}
3725

    
3726
/*
 * Flush @bs and then, recursively, its underlying bs->file.
 *
 * Returns 0 immediately when @bs is NULL, not inserted or read-only.
 * Otherwise the driver's bdrv_co_flush_to_os callback is called if present.
 * Unless BDRV_O_NO_FLUSH is set, the data is then forced to disk through
 * bdrv_co_flush_to_disk or, failing that, bdrv_aio_flush. The first negative
 * result is returned.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    /*
     * Default for drivers with no flush callback at all. Some block drivers
     * always operate in either writethrough or unsafe mode and therefore
     * don't support bdrv_flush. Usually qemu doesn't know how the server
     * works (the behaviour is hardcoded or depends on server-side
     * configuration), so we can't ensure that everything is safe on disk.
     * Returning an error doesn't work because that would break guests even
     * if the server operates in writethrough mode.
     *
     * Let's hope the user knows what he's doing.
     */
    ret = 0;

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion completion = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &completion);
        if (acb != NULL) {
            /* Wait for bdrv_co_io_em_complete() to wake us up. */
            qemu_coroutine_yield();
            ret = completion.ret;
        } else {
            ret = -EIO;
        }
    }

    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
3785

    
3786
/*
 * Invoke the bdrv_invalidate_cache callback of the driver attached to @bs.
 * Does nothing when no driver is attached or the driver does not provide
 * the callback.
 */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_invalidate_cache) {
        return;
    }
    drv->bdrv_invalidate_cache(bs);
}
3792

    
3793
void bdrv_invalidate_cache_all(void)
3794
{
3795
    BlockDriverState *bs;
3796

    
3797
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3798
        bdrv_invalidate_cache(bs);
3799
    }
3800
}
3801

    
3802
void bdrv_clear_incoming_migration_all(void)
3803
{
3804
    BlockDriverState *bs;
3805

    
3806
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
3807
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3808
    }
3809
}
3810

    
3811
/*
 * Synchronous flush entry point.
 *
 * Runs bdrv_flush_co_entry() on a request describing @bs: directly when the
 * caller is already in coroutine context, otherwise inside a freshly created
 * coroutine while this function spins in qemu_aio_wait() until the request's
 * ret field is no longer NOT_DONE.
 *
 * Returns the value left in the request's ret field (bdrv_flush_co_entry()
 * is defined elsewhere in this file and is expected to store the flush
 * result there).
 */
int bdrv_flush(BlockDriverState *bs)
{
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *co = qemu_coroutine_create(bdrv_flush_co_entry);

        qemu_coroutine_enter(co, &rwco);
        /* Drive the AIO loop until the coroutine has published a result */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
        return rwco.ret;
    }

    /* Already in coroutine context: no need to create another coroutine */
    bdrv_flush_co_entry(&rwco);
    return rwco.ret;
}
3832

    
3833
/*
 * Coroutine entry point for bdrv_discard(): @opaque is the RwCo describing
 * the request; the result of bdrv_co_discard() is stored in its ret field.
 */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *req = opaque;
    int result = bdrv_co_discard(req->bs, req->sector_num, req->nb_sectors);

    req->ret = result;
}
3839

    
3840
/*
 * Discard @nb_sectors sectors starting at @sector_num on @bs.
 *
 * Returns:
 *   -ENOMEDIUM  if no driver is attached,
 *   -EIO        if bdrv_check_request() rejects the range, or the driver's
 *               AIO discard could not be submitted,
 *   -EROFS      if @bs is read-only,
 *   the driver's result when it implements bdrv_co_discard or
 *   bdrv_aio_discard,
 *   0           when the driver implements neither callback.
 */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
    if (bs->read_only) {
        return -EROFS;
    }

    /* Prefer the native coroutine implementation */
    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    }

    /* Fall back to the AIO callback and wait for its completion */
    if (bs->drv->bdrv_aio_discard) {
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };
        BlockDriverAIOCB *acb;

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        }
        qemu_coroutine_yield();
        return co.ret;
    }

    /* Driver has no discard support: nothing to do */
    return 0;
}
3869

    
3870
/*
 * Synchronous wrapper around bdrv_co_discard().
 *
 * Runs bdrv_discard_co_entry() directly when already in coroutine context;
 * otherwise starts it in a new coroutine and spins in qemu_aio_wait() until
 * the request's ret field is no longer NOT_DONE.
 *
 * Returns the result stored by bdrv_discard_co_entry().
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (!qemu_in_coroutine()) {
        Coroutine *co = qemu_coroutine_create(bdrv_discard_co_entry);

        qemu_coroutine_enter(co, &rwco);
        /* Drive the AIO loop until the coroutine has published a result */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
        return rwco.ret;
    }

    /* Already in coroutine context: no need to create another coroutine */
    bdrv_discard_co_entry(&rwco);
    return rwco.ret;
}
3893

    
3894
/**************************************************************/
3895
/* removable device support */
3896

    
3897
/**
3898
 * Return TRUE if the media is present
3899
 */
3900
int bdrv_is_inserted(BlockDriverState *bs)
3901
{
3902
    BlockDriver *drv = bs->drv;
3903

    
3904
    if (!drv)
3905
        return 0;
3906
    if (!drv->bdrv_is_inserted)
3907
        return 1;
3908
    return drv->bdrv_is_inserted(bs);
3909
}
3910

    
3911
/**
3912
 * Return whether the media changed since the last call to this
3913
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3914
 */
3915
int bdrv_media_changed(BlockDriverState *bs)
3916
{
3917
    BlockDriver *drv = bs->drv;
3918

    
3919
    if (drv && drv->bdrv_media_changed) {
3920
        return drv->bdrv_media_changed(bs);
3921
    }
3922
    return -ENOTSUP;
3923
}
3924

    
3925
/**
3926
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3927
 */
3928
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3929
{
3930
    BlockDriver *drv = bs->drv;
3931

    
3932
    if (drv && drv->bdrv_eject) {
3933
        drv->bdrv_eject(bs, eject_flag);
3934
    }
3935

    
3936
    if (bs->device_name[0] != '\0') {
3937
        bdrv_emit_qmp_eject_event(bs, eject_flag);
3938
    }
3939
}
3940

    
3941
/**
3942
 * Lock or unlock the media (if it is locked, the user won't be able
3943
 * to eject it manually).
3944
 */
3945
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3946
{
3947
    BlockDriver *drv = bs->drv;
3948

    
3949
    trace_bdrv_lock_medium(bs, locked);
3950

    
3951
    if (drv && drv->bdrv_lock_medium) {
3952
        drv->bdrv_lock_medium(bs, locked);
3953
    }
3954
}
3955

    
3956
/* needed for generic scsi interface */
3957

    
3958
/*
 * Forward ioctl request @req with argument @buf to the driver of @bs.
 *
 * Returns the driver callback's result, or -ENOTSUP when no driver is
 * attached or the driver does not implement bdrv_ioctl.
 */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_ioctl) {
        return -ENOTSUP;
    }
    return drv->bdrv_ioctl(bs, req, buf);
}
3966

    
3967
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3968
        unsigned long int req, void *buf,
3969
        BlockDriverCompletionFunc *cb, void *opaque)
3970
{
3971
    BlockDriver *drv = bs->drv;
3972

    
3973
    if (drv && drv->bdrv_aio_ioctl)
3974
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3975
    return NULL;
3976
}
3977

    
3978
/*
 * Record @align as the buffer alignment of @bs.  The value is consumed by
 * qemu_blockalign().
 */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
3982

    
3983
/*
 * Allocate @size bytes via qemu_memalign(), aligned to the buffer alignment
 * recorded in @bs.  An alignment of 512 is used when @bs is NULL or its
 * buffer_alignment is zero.
 */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    if (bs && bs->buffer_alignment) {
        return qemu_memalign(bs->buffer_alignment, size);
    }
    return qemu_memalign(512, size);
}
3987

    
3988
/*
 * Enable or disable dirty tracking on @bs.
 *
 * The dirty counter is reset in both cases.  When enabling, a zeroed bitmap
 * is allocated unless one already exists; its length in unsigned longs is
 * the device length in sectors divided by
 * BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG, rounded up.  When disabling,
 * the bitmap is freed and the pointer cleared.
 */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;

    if (!enable) {
        /* g_free() accepts NULL, so no guard is needed */
        g_free(bs->dirty_bitmap);
        bs->dirty_bitmap = NULL;
        return;
    }

    if (bs->dirty_bitmap) {
        /* Tracking is already active; keep the existing bitmap */
        return;
    }

    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

    bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
}
4008

    
4009
/*
 * Return 1 if the dirty chunk containing @sector is marked in the dirty
 * bitmap of @bs, 0 otherwise.  Also returns 0 when no bitmap is allocated
 * or @sector lies at or beyond the length reported by bdrv_getlength().
 */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (!bs->dirty_bitmap ||
        (sector << BDRV_SECTOR_BITS) >= bdrv_getlength(bs)) {
        return 0;
    }

    /* One bit per chunk, packed into unsigned long words */
    return (bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8)))) != 0;
}
4021

    
4022
/*
 * Thin wrapper: calls set_dirty_bitmap() for the @nr_sectors sectors
 * starting at @cur_sector with a final argument of 0.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    const int dirty = 0;

    set_dirty_bitmap(bs, cur_sector, nr_sectors, dirty);
}
4027

    
4028
/* Return the current value of the dirty_count field of @bs. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
4032

    
4033
/*
 * Set the in_use marker of @bs to @in_use.  Callers must actually change
 * the state: setting the value it already has trips an assertion.
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
4038

    
4039
/* Return the in_use marker of @bs as set by bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
4043

    
4044
/*
 * Turn on I/O status tracking for @bs and initialise the status to
 * BLOCK_DEVICE_IO_STATUS_OK.
 */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
4049

    
4050
/* The I/O status is only enabled if the drive explicitly
4051
 * enables it _and_ the VM is configured to stop on errors */
4052
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4053
{
4054
    return (bs->iostatus_enabled &&
4055
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4056
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
4057
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
4058
}
4059

    
4060
/*
 * Turn off I/O status tracking for @bs.  The stored iostatus value itself
 * is left untouched.
 */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
4064

    
4065
/*
 * Reset the I/O status of @bs to BLOCK_DEVICE_IO_STATUS_OK.  No-op when
 * bdrv_iostatus_is_enabled() reports that tracking is not in effect.
 */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (!bdrv_iostatus_is_enabled(bs)) {
        return;
    }
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
4071

    
4072
/* XXX: Today this is set by device models because it makes the implementation
4073
   quite simple. However, the block layer knows about the error, so it's
4074
   possible to implement this without device models being involved */
4075
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4076
{
4077
    if (bdrv_iostatus_is_enabled(bs) &&
4078
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4079
        assert(error >= 0);
4080
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4081
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
4082
    }
4083
}
4084

    
4085
void
4086
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4087
        enum BlockAcctType type)
4088
{
4089
    assert(type < BDRV_MAX_IOTYPE);
4090

    
4091
    cookie->bytes = bytes;
4092
    cookie->start_time_ns = get_clock();
4093
    cookie->type = type;
4094
}
4095

    
4096
void
4097
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4098
{
4099
    assert(cookie->type < BDRV_MAX_IOTYPE);
4100

    
4101
    bs->nr_bytes[cookie->type] += cookie->bytes;
4102
    bs->nr_ops[cookie->type]++;
4103
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4104
}
4105

    
4106
int bdrv_img_create(const char *filename, const char *fmt,
4107
                    const char *base_filename, const char *base_fmt,
4108
                    char *options, uint64_t img_size, int flags)
4109
{
4110
    QEMUOptionParameter *param = NULL, *create_options = NULL;
4111
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
4112
    BlockDriverState *bs = NULL;
4113
    BlockDriver *drv, *proto_drv;
4114
    BlockDriver *backing_drv = NULL;
4115
    int ret = 0;
4116

    
4117
    /* Find driver and parse its options */
4118
    drv = bdrv_find_format(fmt);
4119
    if (!drv) {
4120
        error_report("Unknown file format '%s'", fmt);
4121
        ret = -EINVAL;
4122
        goto out;
4123
    }
4124

    
4125
    proto_drv = bdrv_find_protocol(filename);
4126
    if (!proto_drv) {
4127
        error_report("Unknown protocol '%s'", filename);
4128
        ret = -EINVAL;
4129
        goto out;
4130
    }
4131

    
4132
    create_options = append_option_parameters(create_options,
4133
                                              drv->create_options);
4134
    create_options = append_option_parameters(create_options,
4135
                                              proto_drv->create_options);
4136

    
4137
    /* Create parameter list with default values */
4138
    param = parse_option_parameters("", create_options, param);
4139

    
4140
    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4141

    
4142
    /* Parse -o options */
4143
    if (options) {
4144
        param = parse_option_parameters(options, create_options, param);
4145
        if (param == NULL) {
4146
            error_report("Invalid options for file format '%s'.", fmt);
4147
            ret = -EINVAL;
4148
            goto out;
4149
        }
4150
    }
4151

    
4152
    if (base_filename) {
4153
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4154
                                 base_filename)) {
4155
            error_report("Backing file not supported for file format '%s'",
4156
                         fmt);
4157
            ret = -EINVAL;
4158
            goto out;
4159
        }
4160
    }
4161

    
4162
    if (base_fmt) {
4163
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4164
            error_report("Backing file format not supported for file "
4165
                         "format '%s'", fmt);
4166
            ret = -EINVAL;
4167
            goto out;
4168
        }
4169
    }
4170

    
4171
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4172
    if (backing_file && backing_file->value.s) {
4173
        if (!strcmp(filename, backing_file->value.s)) {
4174
            error_report("Error: Trying to create an image with the "
4175
                         "same filename as the backing file");
4176
            ret = -EINVAL;
4177
            goto out;
4178
        }
4179
    }
4180

    
4181
    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4182
    if (backing_fmt && backing_fmt->value.s) {
4183
        backing_drv = bdrv_find_format(backing_fmt->value.s);
4184
        if (!backing_drv) {
4185
            error_report("Unknown backing file format '%s'",
4186
                         backing_fmt->value.s);
4187
            ret = -EINVAL;
4188
            goto out;
4189
        }
4190
    }
4191

    
4192
    // The size for the image must always be specified, with one exception:
4193
    // If we are using a backing file, we can obtain the size from there
4194
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
4195
    if (size && size->value.n == -1) {
4196
        if (backing_file && backing_file->value.s) {
4197
            uint64_t size;
4198
            char buf[32];
4199
            int back_flags;
4200

    
4201
            /* backing files always opened read-only */
4202
            back_flags =
4203
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4204

    
4205
            bs = bdrv_new("");
4206

    
4207
            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4208
            if (ret < 0) {
4209
                error_report("Could not open '%s'", backing_file->value.s);
4210
                goto out;
4211
            }
4212
            bdrv_get_geometry(bs, &size);
4213
            size *= 512;
4214

    
4215
            snprintf(buf, sizeof(buf), "%" PRId64, size);
4216
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4217
        } else {
4218
            error_report("Image creation needs a size parameter");
4219
            ret = -EINVAL;
4220
            goto out;
4221
        }
4222
    }
4223

    
4224
    printf("Formatting '%s', fmt=%s ", filename, fmt);
4225
    print_option_parameters(param);
4226
    puts("");
4227

    
4228
    ret = bdrv_create(drv, filename, param);
4229

    
4230
    if (ret < 0) {
4231
        if (ret == -ENOTSUP) {
4232
            error_report("Formatting or formatting option not supported for "
4233
                         "file format '%s'", fmt);
4234
        } else if (ret == -EFBIG) {
4235
            error_report("The image size is too large for file format '%s'",
4236
                         fmt);
4237
        } else {
4238
            error_report("%s: error while creating %s: %s", filename, fmt,
4239
                         strerror(-ret));
4240
        }
4241
    }
4242

    
4243
out:
4244
    free_option_parameters(create_options);
4245
    free_option_parameters(param);
4246

    
4247
    if (bs) {
4248
        bdrv_delete(bs);
4249
    }
4250

    
4251
    return ret;
4252
}
4253

    
4254
/*
 * Allocate and attach a block job of type @job_type to @bs.
 *
 * Fails with QERR_DEVICE_IN_USE if @bs already has a job or is marked in
 * use.  Otherwise @bs is marked in use and a zeroed object of
 * job_type->instance_size bytes is allocated, initialised with @cb/@opaque,
 * flagged busy and stored in bs->job.  A non-zero @speed is applied through
 * block_job_set_speed(); if that fails, everything is undone and the error
 * is propagated to @errp.
 *
 * Returns the new job, or NULL on failure.
 */
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    Error *local_err = NULL;
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type = job_type;
    job->bs = bs;
    job->cb = cb;
    job->opaque = opaque;
    job->busy = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed == 0) {
        return job;
    }

    block_job_set_speed(job, speed, &local_err);
    if (!error_is_set(&local_err)) {
        return job;
    }

    /* Setting the speed failed: detach and release the job again */
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
    error_propagate(errp, local_err);
    return NULL;
}
4289

    
4290
/*
 * Finish @job with result @ret.
 *
 * The completion callback is invoked first, while the job is still attached
 * to its BlockDriverState.  Afterwards the job is detached and freed and
 * the device's in-use marker is cleared.  @job must be the job currently
 * attached to its device.
 */
void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *owner = job->bs;

    assert(owner->job == job);

    job->cb(job->opaque, ret);

    owner->job = NULL;
    g_free(job);
    bdrv_set_in_use(owner, 0);
}
4300

    
4301
/*
 * Change the speed of @job to @speed.
 *
 * Sets QERR_NOT_SUPPORTED in @errp when the job type has no set_speed
 * callback.  If the callback reports an error it is propagated to @errp
 * and job->speed is left unchanged; on success job->speed is updated.
 */
void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *err = NULL;

    if (job->job_type->set_speed == NULL) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }

    job->job_type->set_speed(job, speed, &err);
    if (!error_is_set(&err)) {
        job->speed = speed;
        return;
    }

    error_propagate(errp, err);
}
4317

    
4318
/*
 * Request cancellation of @job: set its cancelled flag and, if the job has
 * a coroutine that is not currently marked busy, enter that coroutine.
 */
void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;

    if (!job->co || job->busy) {
        return;
    }
    qemu_coroutine_enter(job->co, NULL);
}
4325

    
4326
/* Return the cancelled flag of @job, as set by block_job_cancel(). */
bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}
4330

    
4331
struct BlockCancelData {
4332
    BlockJob *job;
4333
    BlockDriverCompletionFunc *cb;
4334
    void *opaque;
4335
    bool cancelled;
4336
    int ret;
4337
};
4338

    
4339
static void block_job_cancel_cb(void *opaque, int ret)
4340
{
4341
    struct BlockCancelData *data = opaque;
4342

    
4343
    data->cancelled = block_job_is_cancelled(data->job);
4344
    data->ret = ret;
4345
    data->cb(data->opaque, ret);
4346
}
4347

    
4348
/*
 * Cancel @job and wait, via qemu_aio_wait(), until it has completed.
 *
 * The job's completion callback is temporarily replaced by
 * block_job_cancel_cb(), which stores the outcome and still invokes the
 * original callback.
 *
 * Returns -ECANCELED if the job was flagged cancelled and finished with
 * result 0; otherwise the job's own result.
 */
int block_job_cancel_sync(BlockJob *job)
{
    struct BlockCancelData data;

    assert(job->bs->job == job);

    /* Set up our own callback to store the result and chain to
     * the original callback.
     */
    data.job = job;
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
    job->cb = block_job_cancel_cb;
    job->opaque = &data;

    block_job_cancel(job);

    /* block_job_cancel_cb() overwrites data.ret once the job completes */
    while (data.ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    if (data.cancelled && data.ret == 0) {
        return -ECANCELED;
    }
    return data.ret;
}
4370

    
4371
/*
 * Put the job's coroutine to sleep for @ns nanoseconds on @clock, with the
 * job marked not busy for the duration of the sleep.  Returns immediately
 * without sleeping if the job is already cancelled.
 */
void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
{
    /* Check cancellation *before* setting busy = false, too!  */
    if (block_job_is_cancelled(job)) {
        return;
    }

    job->busy = false;
    co_sleep_ns(clock, ns);
    job->busy = true;
}