/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

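/* Usage sketch (added commentary, not part of the original source): drive
 * setup code would fill in bs->io_limits and then turn throttling on.  The
 * field names are the ones tested above; the values are made-up examples.
 *
 *     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 1024 * 1024;  // 1 MiB/s
 *     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = 100;          // 100 req/s
 *     if (bdrv_io_limits_enabled(bs)) {
 *         bdrv_io_limits_enable(bs);
 *     }
 */
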
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests must keep their FIFO order.  No throttled request is dequeued
     * until the current request has been allowed to proceed, so if the
     * current request still exceeds the limits it is re-inserted at the head
     * of the queue, and every request that arrived after it stays behind it
     * in throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

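/* Illustrative example (added commentary, not part of the original source):
 * the typical backing-file lookup.  The file names below are made up.
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/vm/disk.qcow2", "base.qcow2");
 *     // dest is now "/images/vm/base.qcow2"
 *     path_combine(dest, sizeof(dest), "/images/vm/disk.qcow2", "/abs/base.raw");
 *     // the filename is absolute, so dest is "/abs/base.raw"
 */
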
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    if (!drv->bdrv_create)
        return -ENOTSUP;

    return drv->bdrv_create(filename, options);
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

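/* Illustrative mapping (added commentary, not part of the original source):
 * how some made-up filenames resolve under the rules above, assuming no host
 * device driver claims them first.
 *
 *     "nbd:localhost:10809" -> the text before the first ':' is "nbd", so the
 *                              driver whose protocol_name is "nbd" is chosen
 *     "/var/lib/img.qcow2"  -> no "<protocol>:" prefix, so the "file" driver
 *     "c:\disk.img" (win32) -> drive prefixes are not protocols, so "file"
 */
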
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

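/* Usage sketch (added commentary, not part of the original source): parsing
 * a cache= option value, e.g. in a hypothetical caller:
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("writeback", &flags) < 0) {
 *         // report "invalid cache option" to the user
 *     }
 *     // "writeback" sets BDRV_O_CACHE_WB; "none" also sets BDRV_O_NOCACHE,
 *     // and "unsafe" additionally sets BDRV_O_NO_FLUSH.
 */
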
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NUL-terminate the device_name to prevent a double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top.  bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;
}

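/* Usage note (added commentary, not part of the original source): the swap
 * above lets a caller install a freshly created image on top of a live chain
 * without invalidating pointers that guest devices hold to the top
 * BlockDriverState, e.g. (hypothetical):
 *
 *     BlockDriverState *bs_new = bdrv_new("");  // anonymous, as asserted
 *     // ... create and open bs_new backed by bs_top's filename ...
 *     bdrv_append(bs_new, bs_top);  // bs_top keeps its identity; bs_new
 *                                   // becomes its backing file
 */
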
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

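/* Worked example (added commentary, not part of the original source): with a
 * 64 KiB cluster size, c = 65536 / 512 = 128 sectors.  A request for sectors
 * [130, 134) rounds down to *cluster_sector_num = 128 and up to
 * *cluster_nb_sectors = 128, i.e. the whole cluster [128, 256).
 */
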
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

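/* Added commentary (not part of the original source): the two early returns
 * above are the standard half-open interval overlap test.  The range
 * [sector_num, sector_num + nb_sectors) and the tracked request overlap
 * exactly when neither lies entirely to the right of the other; e.g.
 * [0, 8) and [8, 16) do not overlap, while [0, 9) and [8, 16) do.
 */
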
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

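/* Worked example (added commentary, not part of the original source):
 * assuming BDRV_SECTORS_PER_DIRTY_CHUNK == 2048 and 64-bit unsigned long,
 * marking sectors [4096, 6144) dirty gives start = end = chunk 2, stored at
 * idx = 2 / 64 = 0, bit = 2; bit 2 of dirty_bitmap[0] is set and dirty_count
 * is incremented once.
 */
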
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

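/* Worked example (added commentary, not part of the original source): both
 * helpers above use the same three-phase scheme around 512-byte sector
 * boundaries.  For bdrv_pwrite(bs, 700, buf, 2000):
 *   - head: sector 1 is read, bytes [700, 1024) are patched, written back;
 *   - body: sectors 2..4 (bytes [1024, 2560)) are written straight from buf;
 *   - tail: sector 5 is read, bytes [2560, 2700) are patched, written back.
 */
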
/*
1651
 * Writes to the file and ensures that no writes are reordered across this
1652
 * request (acts as a barrier)
1653
 *
1654
 * Returns 0 on success, -errno in error cases.
1655
 */
1656
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1657
    const void *buf, int count)
1658
{
1659
    int ret;
1660

    
1661
    ret = bdrv_pwrite(bs, offset, buf, count);
1662
    if (ret < 0) {
1663
        return ret;
1664
    }
1665

    
1666
    /* No flush needed for cache modes that use O_DSYNC */
1667
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1668
        bdrv_flush(bs);
1669
    }
1670

    
1671
    return 0;
1672
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating the cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
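
/* Worked example for the rounding above, assuming a 64 KiB cluster size
 * (128 sectors of 512 bytes): a guest read of sectors 130..131 is widened
 * to the cluster-aligned range 128..255, so the allocating write covers the
 * whole cluster and no further backing-file I/O is needed for the untouched
 * part of that cluster.
 */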

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;

/* Try to guess the disk logical geometry from the MS-DOS partition table.
   Return 0 if OK, -1 if it could not be guessed. */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;
    bool enabled;

    bdrv_get_geometry(bs, &nb_sectors);

    /**
     * This function is invoked during startup not only in sync I/O mode but
     * also in async I/O mode, so I/O throttling has to be disabled
     * temporarily here, not permanently.
     */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, 0, buf, 1);
    bs->io_limits_enabled = enabled;
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for(i = 0; i < 4; i++) {
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}

void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
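
/* Worked example for the default branch above: an 8 GiB drive has
 * nb_sectors = 16777216, so cylinders = 16777216 / (16 * 63) = 16644,
 * clamped to 16383.  With 16 heads and 63 sectors per track,
 * *pcyls * *pheads = 262128 > 131072, so BIOS_ATA_TRANSLATION_LBA is chosen
 * when LBA was detected and the translation hint is still AUTO.
 */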

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
    FDriveRate rate;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};
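
/* The table is matched purely by total sector count.  For the standard
 * 1.44 MB entry, for instance, (max_head + 1) * max_track * last_sect =
 * 2 * 80 * 18 = 2880 sectors, i.e. 2880 * 512 = 1474560 bytes, the familiar
 * "1.44 MB" (really 1440 KiB).
 */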

void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive,
                                   FDriveRate *rate)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
        *rate = FDRIVE_RATE_500K;
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
        *rate = parse->rate;
    }
}

int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
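
/* A minimal sketch of the expected unlock sequence for encrypted images
 * (illustrative only; in QEMU proper the passphrase comes from the monitor):
 */
#if 0
static int example_unlock(BlockDriverState *bs, const char *passphrase)
{
    if (!bdrv_key_required(bs)) {
        return 0;           /* image (and backing file) already usable */
    }
    /* On success this also fires the media change callback that was
     * skipped at open time. */
    return bdrv_set_key(bs, passphrase);
}
#endif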

void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (!bs->drv) {
        buf[0] = '\0';
    } else {
        pstrcpy(buf, buf_size, bs->drv->format_name);
    }
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
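
/* A sketch of the usual calling pattern (illustrative; qemu-img convert does
 * essentially this): 'pnum' lets the caller jump over whole extents instead
 * of probing sector by sector.
 */
#if 0
static void example_walk_allocation(BlockDriverState *bs)
{
    int64_t total_sectors = bs->total_sectors;
    int64_t sector_num = 0;
    int n, pnum;

    while (sector_num < total_sectors) {
        n = MIN(total_sectors - sector_num, 65536);
        if (bdrv_is_allocated(bs, sector_num, n, &pnum)) {
            /* ... copy sectors sector_num..sector_num + pnum - 1 ... */
        }
        sector_num += pnum;
    }
}
#endif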

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

/* Consider exposing this as a full-fledged QMP command */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    return drv->bdrv_debug_event(bs, event);
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    if (!bs->drv) {
        return NULL;
    }

    if (bs->backing_hd) {
        if (strcmp(bs->backing_file, backing_file) == 0) {
            return bs->backing_hd;
        } else {
            return bdrv_find_backing_image(bs->backing_hd, backing_file);
        }
    }

    return NULL;
}

#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for(i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
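
/* Examples of the formatting above: 512 -> "512", 1536 -> "1.5K",
 * 512000 -> "500K" (rounded via the base/2 bias), and a full 1073741824
 * -> "1.0G".  Values up to 999 are printed without a suffix.
 */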

char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}
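
/* A minimal sketch of the callback-based AIO pattern (the handler and its
 * use of a NULL opaque pointer are illustrative, not a fixed convention):
 */
#if 0
static void example_read_done(void *opaque, int ret)
{
    /* ret is 0 on success or a negative errno; by this point the guest
     * memory described by the qiov has been filled. */
}

static void example_submit_read(BlockDriverState *bs, QEMUIOVector *qiov,
                                int64_t sector_num, int nb_sectors)
{
    BlockDriverAIOCB *acb;

    acb = bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                         example_read_done, NULL);
    /* acb may later be handed to bdrv_aio_cancel() if needed. */
}
#endif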

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
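
/* Worked example of the merge above: after sorting, a request for sectors
 * 0..7 and one for sectors 8..15 satisfy reqs[i].sector <= oldreq_last
 * (8 <= 8), so they collapse into a single request for sectors 0..15 whose
 * qiov chains both original vectors; a third request starting at sector 32
 * leaves a gap and therefore stays separate.
 */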

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
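
/* A minimal sketch of a multiwrite submission (the two-request batch and the
 * completion handler are illustrative; virtio-blk is the real user of this
 * interface):
 */
#if 0
static void example_write_done(void *opaque, int ret)
{
    /* called once per original request with 0 or a negative errno */
}

static void example_multiwrite(BlockDriverState *bs, QEMUIOVector *qiov0,
                               QEMUIOVector *qiov1)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = 8, .qiov = qiov0,
          .cb = example_write_done, .opaque = NULL },
        { .sector = 8, .nb_sectors = 8, .qiov = qiov1,
          .cb = example_write_done, .opaque = NULL },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* submission failed; reqs[i].error tells which callbacks
         * will never run */
    }
}
#endif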

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the number of bytes already read/written in this slice,
     *             obtained from the accounting statistics.
     * bytes_res:  the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: the total time needed to
     *             complete reading/writing all of the data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limit, bs->slice_end needs to
     * be extended so that the current statistics are kept until the timer
     * fires; the extension factor was tuned based on experimental results.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
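
/* Worked example of the check above, assuming bps_limit = 1048576 (1 MiB/s)
 * and a 0.1 s slice: bytes_limit = 104857.6.  If 90000 bytes are already
 * accounted to this slice and a 32 KiB request arrives, bytes_base +
 * bytes_res = 90000 + 32768 = 122768 exceeds the limit, so the request waits
 * about 122768 / 1048576 - elapsed_time seconds before being dispatched.
 */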

static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
3340

    
3341

    
3342
typedef struct BlockDriverAIOCBCoroutine {
3343
    BlockDriverAIOCB common;
3344
    BlockRequest req;
3345
    bool is_write;
3346
    QEMUBH* bh;
3347
} BlockDriverAIOCBCoroutine;
3348

    
3349
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

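/* Emulate the AIO interface on top of coroutines: allocate an AIOCB, stash
 * the request in it, and spawn a coroutine that performs the I/O and then
 * schedules a bottom half to deliver the completion callback. */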
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

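/* Allocate an AIOCB from the pool's free list, or from the heap when the
 * free list is empty.  Released AIOCBs are pushed back onto the free list
 * by qemu_aio_release() instead of being freed, so frequently used pools
 * avoid repeated allocations. */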
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

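/* Emulate coroutine I/O on top of a driver's AIO interface: submit the
 * request with bdrv_co_io_em_complete() as the completion callback, then
 * yield; the callback re-enters this coroutine, which picks the result up
 * from the CoroutineIOCompletion on its stack. */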
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

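/* Flush in up to three stages: write back the driver's internal caches to
 * the OS (this happens even with cache=unsafe), force the data to stable
 * storage unless BDRV_O_NO_FLUSH is set, and finally recurse into bs->file
 * so the underlying protocol is flushed as well. */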
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

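/* Synchronous wrapper around bdrv_co_flush(): when already running in
 * coroutine context, the entry point is called directly; otherwise a new
 * coroutine is spawned and the caller spins in qemu_aio_wait() until the
 * request signals completion through rwco.ret. */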
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

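/* Discard goes through a cascade similar to flush: prefer the driver's
 * native coroutine implementation, fall back to its AIO interface, and
 * treat missing discard support as success, since discarding is only an
 * optimization hint. */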
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
}

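/* Dirty tracking uses one bit per chunk of BDRV_SECTORS_PER_DIRTY_CHUNK
 * sectors; the arithmetic below converts the device's sector count into a
 * byte count for the bitmap, rounding up so that a partial final chunk is
 * still covered. */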
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

            bs->dirty_bitmap = g_malloc0(bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

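/* I/O accounting: device models bracket each request with bdrv_acct_start()
 * and bdrv_acct_done(); the cookie carries the byte count, start timestamp
 * and request type so that bdrv_acct_done() can charge the right per-type
 * counters on the BlockDriverState. */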
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

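/* Create a new image: resolve the format and protocol drivers, merge their
 * option lists, apply -o options and the backing-file settings, and finally
 * call bdrv_create().  The size may be omitted only when a backing file is
 * given, in which case it is derived from the backing file's geometry. */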
int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}

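/* Block job lifecycle: block_job_create() claims the BlockDriverState (at
 * most one job per device, and the device must not otherwise be in use);
 * block_job_complete() delivers the final callback and releases it.
 * Cancellation is cooperative: block_job_cancel() merely sets a flag that
 * the job is expected to poll via block_job_is_cancelled(). */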
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       BlockDriverCompletionFunc *cb, void *opaque,
                       Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type      = job_type;
    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    bs->job = job;
    return job;
}

void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}

void block_job_set_speed(BlockJob *job, int64_t value, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }
    job->job_type->set_speed(job, value, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    job->speed = value;
}

void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
}

bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}

void block_job_cancel_sync(BlockJob *job)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    block_job_cancel(job);
    while (bs->job != NULL && bs->job->busy) {
        qemu_aio_wait();
    }
}