/* block.c @ 7e680753 */

/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
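
/* Illustrative examples (comment added for clarity; not in the original
 * source), following the checks above:
 *   is_windows_drive("c:")                    -> 1  (bare drive letter)
 *   is_windows_drive("\\\\.\\PhysicalDrive0") -> 1  (Win32 device namespace)
 *   is_windows_drive("c:\\disk.img")          -> 0  (prefix only, a file path)
 */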

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}
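
/* Minimal usage sketch (added comment; the values and call site below are
 * made-up examples -- the real option plumbing lives outside this file):
 *
 *     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 1024 * 1024;
 *     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = 100;
 *     if (bdrv_io_limits_enabled(bs)) {
 *         bdrv_io_limits_enable(bs);
 *     }
 *
 * This caps the drive at 1 MB/s and 100 requests/s for reads and writes
 * combined; bdrv_io_limits_intercept() below then throttles each request.
 */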

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are serviced in FIFO order.  The next throttled request is
     * not dequeued until the current request has been allowed to proceed,
     * so if the current request still exceeds the limits it is re-inserted
     * at the head of the queue and every request behind it stays in
     * throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}
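
/* Illustrative examples (added comment), following the logic above:
 *   path_has_protocol("nbd:localhost:10809") -> non-zero ("nbd" prefix)
 *   path_has_protocol("/var/lib/disk.img")   -> 0  (no ':' in the path)
 *   path_is_absolute("/var/lib/disk.img")    -> 1
 *   path_is_absolute("images/disk.img")      -> 0
 * On _WIN32, "d:\\disk.img" and "\\\\.\\d:" also count as absolute.
 */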

/* If filename is absolute, just copy it to dest.  Otherwise, build a path
   to it by treating it as relative to base_path.  URLs are supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
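
/* Worked examples (added comment; buffers and sizes elided):
 *   path_combine(dest, sz, "/data/base.img", "snap.qcow2")
 *       -> "/data/snap.qcow2"
 *   path_combine(dest, sz, "http://host/dir/base.img", "snap.qcow2")
 *       -> "http://host/dir/snap.qcow2"
 *   path_combine(dest, sz, "/data/base.img", "/abs/snap.qcow2")
 *       -> "/abs/snap.qcow2"    (absolute filenames are copied verbatim)
 */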

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    if (!drv->bdrv_create)
        return -ENOTSUP;

    return drv->bdrv_create(filename, options);
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
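
/* Resolution order, summarized from the code above (examples illustrative):
 *   "/dev/cdrom"           -> a host device driver, via bdrv_probe_device()
 *   "/var/lib/disk.img"    -> the "file" driver (no "<protocol>:" prefix)
 *   "nbd:localhost:10809"  -> the driver whose protocol_name is "nbd"
 */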

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
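
/* Probing sketch (added comment): each driver's bdrv_probe() inspects the
 * first 2048 bytes and returns a score; the highest-scoring driver wins.
 * A format typically returns a high score (e.g. 100) when its magic bytes
 * match -- the exact values are private to each driver.
 */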

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
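
/* Cache mode -> open flags, as parsed above:
 *   "off" / "none"   BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   "directsync"     BDRV_O_NOCACHE
 *   "writeback"      BDRV_O_CACHE_WB
 *   "unsafe"         BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   "writethrough"   (no flags; the default)
 */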

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
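
/* Open flow summary (added comment): BDRV_O_SNAPSHOT redirects 'filename'
 * to a freshly created temporary qcow2 whose backing file is the original
 * image; otherwise the format is probed via find_image_format() unless a
 * driver was passed in, and any backing file is then opened read-only.
 */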

void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

    
856
/*
857
 * Wait for pending requests to complete across all BlockDriverStates
858
 *
859
 * This function does not flush data to disk, use bdrv_flush_all() for that
860
 * after calling this function.
861
 */
862
void bdrv_drain_all(void)
863
{
864
    BlockDriverState *bs;
865

    
866
    qemu_aio_flush();
867

    
868
    /* If requests are still pending there is a bug somewhere */
869
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
870
        assert(QLIST_EMPTY(&bs->tracked_requests));
871
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
872
    }
873
}
874

    
875
/* make a BlockDriverState anonymous by removing from bdrv_state list.
876
   Also, NULL terminate the device_name to prevent double remove */
877
void bdrv_make_anon(BlockDriverState *bs)
878
{
879
    if (bs->device_name[0] != '\0') {
880
        QTAILQ_REMOVE(&bdrv_states, bs, list);
881
    }
882
    bs->device_name[0] = '\0';
883
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* the new bs must not be in bdrv_states */
    bdrv_make_anon(bs_new);

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;
}
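
/* Net effect (added comment): after bdrv_append(), the bs_top pointer still
 * names the device-visible state, field for field, while bs_new now holds
 * what used to be the top image and serves as bs_top's backing file, so
 * callers keep using bs_top unchanged.
 */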

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

    
1027
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1028
                               BlockQMPEventAction action, int is_read)
1029
{
1030
    QObject *data;
1031
    const char *action_str;
1032

    
1033
    switch (action) {
1034
    case BDRV_ACTION_REPORT:
1035
        action_str = "report";
1036
        break;
1037
    case BDRV_ACTION_IGNORE:
1038
        action_str = "ignore";
1039
        break;
1040
    case BDRV_ACTION_STOP:
1041
        action_str = "stop";
1042
        break;
1043
    default:
1044
        abort();
1045
    }
1046

    
1047
    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1048
                              bdrv->device_name,
1049
                              action_str,
1050
                              is_read ? "read" : "write");
1051
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1052

    
1053
    qobject_decref(data);
1054
}
1055

    
1056
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1057
{
1058
    QObject *data;
1059

    
1060
    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1061
                              bdrv_get_device_name(bs), ejected);
1062
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1063

    
1064
    qobject_decref(data);
1065
}
1066

    
1067
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1068
{
1069
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1070
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1071
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1072
        if (tray_was_closed) {
1073
            /* tray open */
1074
            bdrv_emit_qmp_eject_event(bs, true);
1075
        }
1076
        if (load) {
1077
            /* tray close */
1078
            bdrv_emit_qmp_eject_event(bs, false);
1079
        }
1080
    }
1081
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* pstrcpy, unlike strncpy, guarantees NUL termination */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

void bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_commit(bs);
    }
}
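
/* Commit sketch (added comment): for every allocated region of the COW
 * overlay, bdrv_commit() reads it and writes it into the backing file, then
 * empties the overlay (when the driver supports bdrv_make_empty) and flushes
 * the backing device so the committed data is stable on disk.
 */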

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
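
/* Worked example (added comment): with a 64 KiB cluster size, c = 65536 /
 * 512 = 128 sectors, so a request for sectors [130, 140) rounds out to the
 * cluster-aligned region [128, 256).
 */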

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
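
/* Pattern note (added comment): this is how the legacy synchronous API is
 * layered on the coroutine path -- run bdrv_rw_co_entry() directly when
 * already in coroutine context, otherwise spawn a coroutine and spin in
 * qemu_aio_wait() until rwco.ret leaves the NOT_DONE sentinel.
 */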

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
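
/* Bitmap layout (added comment): sectors are tracked in chunks of
 * BDRV_SECTORS_PER_DIRTY_CHUNK; chunk i is bit (i % (8 * sizeof(unsigned
 * long))) of word (i / (8 * sizeof(unsigned long))) in bs->dirty_bitmap.
 */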

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
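
/* Worked example (added comment): with 512-byte sectors,
 * bdrv_pread(bs, 1000, buf, 100) first reads sector 1 and copies the 24
 * bytes up to the sector boundary, then, since only 76 bytes remain (no
 * whole sector), reads sector 2 and copies its first 76 bytes -- two
 * sector-sized reads in total.
 */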

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
                                        cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
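
/* Copy-on-read sequence (added comment): read the whole covering cluster
 * into a bounce buffer, write it back into this image (using the driver's
 * write_zeroes when the buffer is all zero), then copy just the requested
 * sub-range into the caller's qiov.
 */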

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}
1764

    
1765
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1766
    int nb_sectors, QEMUIOVector *qiov)
1767
{
1768
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1769

    
1770
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1771
}
1772

    
1773
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1774
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1775
{
1776
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1777

    
1778
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1779
                            BDRV_REQ_COPY_ON_READ);
1780
}
1781

    
1782
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1783
    int64_t sector_num, int nb_sectors)
1784
{
1785
    BlockDriver *drv = bs->drv;
1786
    QEMUIOVector qiov;
1787
    struct iovec iov;
1788
    int ret;
1789

    
1790
    /* First try the efficient write zeroes operation */
1791
    if (drv->bdrv_co_write_zeroes) {
1792
        return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1793
    }
1794

    
1795
    /* Fall back to bounce buffer if write zeroes is unsupported */
1796
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1797
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1798
    memset(iov.iov_base, 0, iov.iov_len);
1799
    qemu_iovec_init_external(&qiov, &iov, 1);
1800

    
1801
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1802

    
1803
    qemu_vfree(iov.iov_base);
1804
    return ret;
1805
}
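
/* Illustrative note (not in the original source): if a driver lacks
 * bdrv_co_write_zeroes, zeroing e.g. 2048 sectors takes the fallback path and
 * allocates a 2048 * 512 = 1 MiB bounce buffer filled with zeros, which is
 * then written through the ordinary bdrv_co_writev path.
 */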

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;

/* Try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if it could not be guessed. */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;

    bdrv_get_geometry(bs, &nb_sectors);

    ret = bdrv_read(bs, 0, buf, 1);
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for (i = 0; i < 4; i++) {
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}
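
/* Illustrative note (not in the original source): for a hypothetical 512 MiB
 * disk (nb_sectors = 1048576) whose first partition entry ends at head 15,
 * sector 63, the guess is heads = 15 + 1 = 16, sectors = 63 & 63 = 63, and
 * cylinders = 1048576 / (16 * 63) = 1040, which passes the 1..16383 sanity
 * check.
 */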

void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
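
/* Illustrative note (not in the original source): with the default physical
 * geometry of 16 heads and 63 sectors/track, a 1 GiB disk (2097152 sectors)
 * yields 2097152 / (16 * 63) = 2080 cylinders.  Since 2080 * 16 <= 131072,
 * BIOS_ATA_TRANSLATION_LARGE would be chosen when LBA use was detected and
 * the translation hint is still AUTO.
 */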

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
    FDriveRate rate;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};

void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive,
                                   FDriveRate *rate)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
        *rate = FDRIVE_RATE_500K;
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
        *rate = parse->rate;
    }
}

int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (!bs->drv) {
        buf[0] = '\0';
    } else {
        pstrcpy(buf, buf_size, bs->drv->format_name);
    }
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
            bdrv_flush(bs);
        }
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
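
/* Usage sketch (illustrative, not part of the original file): walking a
 * device's allocation map with the synchronous wrapper, in the style of
 * "qemu-img map".  The function name is hypothetical.
 */
#if 0
static void example_dump_allocation(BlockDriverState *bs)
{
    int64_t total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    int64_t sector = 0;

    while (sector < total_sectors) {
        int pnum;
        int nb = MIN(total_sectors - sector, INT_MAX);
        int ret = bdrv_is_allocated(bs, sector, nb, &pnum);

        if (ret < 0 || pnum == 0) {
            break;  /* error, or past the end of the image */
        }
        printf("[%" PRId64 ", +%d) %s\n", sector, pnum,
               ret ? "allocated" : "unallocated");
        sector += pnum;
    }
}
#endif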

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

/* Consider exposing this as a full fledged QMP command */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    if (!bs->drv) {
        return NULL;
    }

    if (bs->backing_hd) {
        if (strcmp(bs->backing_file, backing_file) == 0) {
            return bs->backing_hd;
        } else {
            return bdrv_find_backing_image(bs->backing_hd, backing_file);
        }
    }

    return NULL;
}

#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
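
/* Illustrative note (not in the original source): get_human_readable_size()
 * rounds to the nearest unit, so 1536 -> "1.5K", 1048576 -> "1.0M", and
 * 10 * 1024 + 512 = 10752 -> "11K" (the half-base addend implements
 * round-to-nearest in the integer branch).
 */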

char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}
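
/* Illustrative note (not in the original source): with hypothetical sectors
 * req1->sector = 0 and req2->sector = 0x100000000LL, the naive int64_t
 * difference truncated to the comparison function's int return value would be
 * 0, wrongly reporting the requests as equal; hence the explicit three-way
 * comparison above.
 */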

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the requests
 * may or may not have been submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
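
/* Usage sketch (illustrative, not part of the original file): submitting two
 * adjacent writes in one batch.  The completion callback fires once per
 * original request even when the merge step coalesces them into a single
 * driver-level write.  The function names here are hypothetical.
 */
#if 0
static void example_write_done(void *opaque, int ret)
{
    /* 'ret' is 0 on success, or a negative errno shared by merged requests */
}

static void example_multiwrite(BlockDriverState *bs, QEMUIOVector *qiov1,
                               QEMUIOVector *qiov2)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = 8, .qiov = qiov1,
          .cb = example_write_done, .opaque = NULL },
        { .sector = 8, .nb_sectors = 8, .qiov = qiov2,
          .cb = example_write_done, .opaque = NULL },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* check reqs[i].error to see which callbacks will still run */
    }
}
#endif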

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written, obtained
     *             from the history statistics.
     * bytes_res:  the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calculate the total time
     *             for completing reading/writing all data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits, bs->slice_end needs to
     * be extended so that the current statistics can be kept until the timer
     * fires; the extension factor is increased and tuned based on experiment.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
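
/* Illustrative note (not in the original source): with a hypothetical limit
 * of bps_limit = 1048576 (1 MiB/s) and a 100 ms slice, bytes_limit is
 * 104857.6.  If 90000 bytes were already transferred in this slice
 * (bytes_base) and the next request adds bytes_res = 32768, then
 * 90000 + 32768 > 104857.6, so the request is throttled for roughly
 * (122768 / 1048576) - elapsed_time seconds before being dispatched.
 */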
3106

    
3107
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3108
                             double elapsed_time, uint64_t *wait)
3109
{
3110
    uint64_t iops_limit = 0;
3111
    double   ios_limit, ios_base;
3112
    double   slice_time, wait_time;
3113

    
3114
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3115
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3116
    } else if (bs->io_limits.iops[is_write]) {
3117
        iops_limit = bs->io_limits.iops[is_write];
3118
    } else {
3119
        if (wait) {
3120
            *wait = 0;
3121
        }
3122

    
3123
        return false;
3124
    }
3125

    
3126
    slice_time = bs->slice_end - bs->slice_start;
3127
    slice_time /= (NANOSECONDS_PER_SECOND);
3128
    ios_limit  = iops_limit * slice_time;
3129
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3130
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3131
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3132
    }
3133

    
3134
    if (ios_base + 1 <= ios_limit) {
3135
        if (wait) {
3136
            *wait = 0;
3137
        }
3138

    
3139
        return false;
3140
    }
3141

    
3142
    /* Calc approx time to dispatch */
3143
    wait_time = (ios_base + 1) / iops_limit;
3144
    if (wait_time > elapsed_time) {
3145
        wait_time = wait_time - elapsed_time;
3146
    } else {
3147
        wait_time = 0;
3148
    }
3149

    
3150
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3151
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3152
    if (wait) {
3153
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3154
    }
3155

    
3156
    return true;
3157
}
3158

    
3159
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3160
                           bool is_write, int64_t *wait)
3161
{
3162
    int64_t  now, max_wait;
3163
    uint64_t bps_wait = 0, iops_wait = 0;
3164
    double   elapsed_time;
3165
    int      bps_ret, iops_ret;
3166

    
3167
    now = qemu_get_clock_ns(vm_clock);
3168
    if ((bs->slice_start < now)
3169
        && (bs->slice_end > now)) {
3170
        bs->slice_end = now + bs->slice_time;
3171
    } else {
3172
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
3173
        bs->slice_start = now;
3174
        bs->slice_end   = now + bs->slice_time;
3175

    
3176
        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
3177
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3178

    
3179
        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
3180
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
3181
    }
3182

    
3183
    elapsed_time  = now - bs->slice_start;
3184
    elapsed_time  /= (NANOSECONDS_PER_SECOND);
3185

    
3186
    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
3187
                                      is_write, elapsed_time, &bps_wait);
3188
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3189
                                      elapsed_time, &iops_wait);
3190
    if (bps_ret || iops_ret) {
3191
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3192
        if (wait) {
3193
            *wait = max_wait;
3194
        }
3195

    
3196
        now = qemu_get_clock_ns(vm_clock);
3197
        if (bs->slice_end < now + max_wait) {
3198
            bs->slice_end = now + max_wait;
3199
        }
3200

    
3201
        return true;
3202
    }
3203

    
3204
    if (wait) {
3205
        *wait = 0;
3206
    }
3207

    
3208
    return false;
3209
}
3210

    
3211
/**************************************************************/
3212
/* async block device emulation */
3213

    
3214
typedef struct BlockDriverAIOCBSync {
3215
    BlockDriverAIOCB common;
3216
    QEMUBH *bh;
3217
    int ret;
3218
    /* vector translation state */
3219
    QEMUIOVector *qiov;
3220
    uint8_t *bounce;
3221
    int is_write;
3222
} BlockDriverAIOCBSync;
3223

    
3224
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3225
{
3226
    BlockDriverAIOCBSync *acb =
3227
        container_of(blockacb, BlockDriverAIOCBSync, common);
3228
    qemu_bh_delete(acb->bh);
3229
    acb->bh = NULL;
3230
    qemu_aio_release(acb);
3231
}
3232

    
3233
static AIOPool bdrv_em_aio_pool = {
3234
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3235
    .cancel             = bdrv_aio_cancel_em,
3236
};
3237

    
3238
static void bdrv_aio_bh_cb(void *opaque)
3239
{
3240
    BlockDriverAIOCBSync *acb = opaque;
3241

    
3242
    if (!acb->is_write)
3243
        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3244
    qemu_vfree(acb->bounce);
3245
    acb->common.cb(acb->common.opaque, acb->ret);
3246
    qemu_bh_delete(acb->bh);
3247
    acb->bh = NULL;
3248
    qemu_aio_release(acb);
3249
}
3250

    
3251
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3252
                                            int64_t sector_num,
3253
                                            QEMUIOVector *qiov,
3254
                                            int nb_sectors,
3255
                                            BlockDriverCompletionFunc *cb,
3256
                                            void *opaque,
3257
                                            int is_write)
3258

    
3259
{
3260
    BlockDriverAIOCBSync *acb;
3261

    
3262
    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3263
    acb->is_write = is_write;
3264
    acb->qiov = qiov;
3265
    acb->bounce = qemu_blockalign(bs, qiov->size);
3266
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3267

    
3268
    if (is_write) {
3269
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3270
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3271
    } else {
3272
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3273
    }
3274

    
3275
    qemu_bh_schedule(acb->bh);
3276

    
3277
    return &acb->common;
3278
}
3279

    
3280
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3281
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3282
        BlockDriverCompletionFunc *cb, void *opaque)
3283
{
3284
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3285
}
3286

    
3287
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3288
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3289
        BlockDriverCompletionFunc *cb, void *opaque)
3290
{
3291
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3292
}
3293

    
3294

    
3295
typedef struct BlockDriverAIOCBCoroutine {
3296
    BlockDriverAIOCB common;
3297
    BlockRequest req;
3298
    bool is_write;
3299
    QEMUBH* bh;
3300
} BlockDriverAIOCBCoroutine;
3301

    
3302
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3303
{
3304
    qemu_aio_flush();
3305
}
3306

    
3307
static AIOPool bdrv_em_co_aio_pool = {
3308
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3309
    .cancel             = bdrv_aio_co_cancel_em,
3310
};
3311

    
3312
static void bdrv_co_em_bh(void *opaque)
3313
{
3314
    BlockDriverAIOCBCoroutine *acb = opaque;
3315

    
3316
    acb->common.cb(acb->common.opaque, acb->req.error);
3317
    qemu_bh_delete(acb->bh);
3318
    qemu_aio_release(acb);
3319
}
3320

    
3321
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3322
static void coroutine_fn bdrv_co_do_rw(void *opaque)
3323
{
3324
    BlockDriverAIOCBCoroutine *acb = opaque;
3325
    BlockDriverState *bs = acb->common.bs;
3326

    
3327
    if (!acb->is_write) {
3328
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3329
            acb->req.nb_sectors, acb->req.qiov, 0);
3330
    } else {
3331
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3332
            acb->req.nb_sectors, acb->req.qiov, 0);
3333
    }
3334

    
3335
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3336
    qemu_bh_schedule(acb->bh);
3337
}
3338

    
3339
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3340
                                               int64_t sector_num,
3341
                                               QEMUIOVector *qiov,
3342
                                               int nb_sectors,
3343
                                               BlockDriverCompletionFunc *cb,
3344
                                               void *opaque,
3345
                                               bool is_write)
3346
{
3347
    Coroutine *co;
3348
    BlockDriverAIOCBCoroutine *acb;
3349

    
3350
    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3351
    acb->req.sector = sector_num;
3352
    acb->req.nb_sectors = nb_sectors;
3353
    acb->req.qiov = qiov;
3354
    acb->is_write = is_write;
3355

    
3356
    co = qemu_coroutine_create(bdrv_co_do_rw);
3357
    qemu_coroutine_enter(co, acb);
3358

    
3359
    return &acb->common;
3360
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_flush(bs, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
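
/*
 * A device model typically drives bdrv_aio_flush() with a completion
 * callback of its own; illustrative sketch (callback name hypothetical):
 *
 *     static void my_flush_cb(void *opaque, int ret)
 *     {
 *         ... ret is 0 on success or a negative errno ...
 *     }
 *
 *     acb = bdrv_aio_flush(bs, my_flush_cb, opaque);
 *
 * The returned acb can be passed to bdrv_aio_cancel() while the flush is
 * still pending.
 */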

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}
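
/*
 * qemu_aio_get() and qemu_aio_release() implement a simple per-pool free
 * list: a released AIOCB is pushed onto pool->free_aiocb and handed back by
 * the next qemu_aio_get() on the same pool, so in steady state each AIOCB
 * is allocated only once and never returned to the heap.  An AIOCB is
 * normally released from the completion path (see bdrv_co_em_bh() above).
 */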

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
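
/*
 * Control flow in bdrv_co_io_em(): the request is submitted through the
 * driver's callback-based AIO interface and the coroutine yields; when the
 * driver completes the request, bdrv_co_io_em_complete() records the return
 * value and re-enters the coroutine, so execution resumes directly after
 * qemu_coroutine_yield() with co.ret valid.  This is the glue that lets
 * synchronous-looking coroutine code run on top of AIO-only drivers.
 */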

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs->drv) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        return bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        return 0;
    }
}
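
/*
 * For reference, a driver-side flush hook is a coroutine_fn returning 0 or
 * -errno.  A rough sketch of what a file-backed driver's
 * bdrv_co_flush_to_disk could look like (hypothetical driver state "s"
 * holding a file descriptor; real drivers such as raw-posix push the
 * syscall to a worker thread rather than blocking the coroutine):
 *
 *     static coroutine_fn int my_co_flush_to_disk(BlockDriverState *bs)
 *     {
 *         BDRVMyState *s = bs->opaque;
 *         return qemu_fdatasync(s->fd) < 0 ? -errno : 0;
 *     }
 */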

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
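
/*
 * bdrv_flush() is the synchronous wrapper around bdrv_co_flush(): in
 * coroutine context it calls the entry point directly, otherwise it spawns
 * a coroutine and pumps the event loop with qemu_aio_wait() until rwco.ret
 * leaves the NOT_DONE sentinel.  bdrv_discard() below follows the same
 * pattern.
 */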

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}
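
/*
 * I/O buffers handed to the block layer should come from qemu_blockalign()
 * so they meet the backing file's alignment requirements (O_DIRECT, for
 * instance, generally wants sector-aligned buffers).  Typical use:
 *
 *     buf = qemu_blockalign(bs, len);
 *     ...
 *     qemu_vfree(buf);
 *
 * With no buffer_alignment set on the device this falls back to 512-byte
 * alignment.
 */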

void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            /* Each bitmap byte covers 8 chunks of
             * BDRV_SECTORS_PER_DIRTY_CHUNK sectors; round up so that the
             * last partial chunk is covered too. */
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

            bs->dirty_bitmap = g_malloc0(bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}
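
/*
 * Worked example of the indexing above, assuming 64-bit unsigned long:
 * sector N belongs to chunk N / BDRV_SECTORS_PER_DIRTY_CHUNK, which lives
 * in word chunk / 64 of the bitmap at bit chunk % 64.  Sectors at or past
 * the end of the image always report clean thanks to the explicit
 * bdrv_getlength() check.
 */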

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
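
/*
 * Device models bracket every guest request with this pair.  Illustrative
 * write path (structure names hypothetical):
 *
 *     bdrv_acct_start(bs, &req->acct, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_WRITE);
 *     bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, my_write_cb, req);
 *
 * with bdrv_acct_done(bs, &req->acct) called from my_write_cb(), at which
 * point the byte, operation and latency totals are folded into the
 * BlockDriverState counters read out by "info blockstats".
 */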

int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRIu64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}
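
/*
 * bdrv_img_create() is the programmatic equivalent of "qemu-img create".
 * A minimal illustrative call creating a 10 GiB qcow2 image with default
 * options, no backing file and no extra flags:
 *
 *     ret = bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                           (uint64_t)10 << 30, 0);
 *
 * Failures have already been reported via error_report() by the time a
 * negative value is returned.
 */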

void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type      = job_type;
    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    bs->job = job;
    return job;
}

void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}

int block_job_set_speed(BlockJob *job, int64_t value)
{
    if (!job->job_type->set_speed) {
        return -ENOTSUP;
    }
    return job->job_type->set_speed(job, value);
}

void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
}

bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}
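
/*
 * Block job lifecycle, as implemented above: an owner calls
 * block_job_create(), which returns NULL when the device already has a job
 * or is otherwise in use; the job's coroutine polls
 * block_job_is_cancelled() at convenient points and finishes by calling
 * block_job_complete() exactly once, which fires the owner's callback and
 * releases the device.  Illustrative skeleton (names hypothetical):
 *
 *     static void coroutine_fn my_job_run(void *opaque)
 *     {
 *         MyJob *s = opaque;
 *         int ret = 0;
 *
 *         while (work_left(s) && !block_job_is_cancelled(&s->common)) {
 *             ret = copy_one_chunk(s);
 *             if (ret < 0) {
 *                 break;
 *             }
 *         }
 *         block_job_complete(&s->common, ret);
 *     }
 */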