/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}
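
/* Arm throttling for this device: create the vm_clock timer used to wake
 * throttled requests and start a fresh accounting slice. */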
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
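
/* Called from the coroutine read/write paths: blocks the calling coroutine
 * until the request fits within the configured bandwidth and iops limits. */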
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are serviced in FIFO order to preserve their original timing.
     * The next throttled request is not dequeued until the current request
     * has been allowed to proceed.  If the current request still exceeds the
     * limits, it is re-inserted at the head of the queue, and every request
     * behind it stays in throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    if (!drv->bdrv_create)
        return -ENOTSUP;

    return drv->bdrv_create(filename, options);
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
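
/* Generate the name of (and, on POSIX, create) an empty temporary file:
 * GetTempFileName() on Win32, mkstemp() elsewhere. */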
#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
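
/* Pick the protocol driver for a filename: host-device probing gets first
 * chance, then the "<protocol>:" prefix is matched against each driver's
 * protocol_name; plain paths fall back to the "file" driver. */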
BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
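
/* Guess the image format: read the first 2048 bytes and let each driver's
 * bdrv_probe() score the header; the highest-scoring driver wins. */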
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states
   list.  Also, NUL-terminate device_name to prevent a double removal. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* the new bs must not be in bdrv_states */
    bdrv_make_anon(bs_new);

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* use pstrcpy rather than strncpy so that filename is always
     * NUL-terminated even if the source fills the buffer */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags =  bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}
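
/* Record of an I/O request in flight, kept on bs->tracked_requests; used to
 * detect overlapping requests and to serialize copy-on-read against guest
 * writes to the same cluster. */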
struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
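
/* Two sector ranges overlap unless one ends before the other begins. */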
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}
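
/* Validate that a byte-range request lies within the device; growable files
 * accept any offset because a write may extend them. */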
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;
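
/* Coroutine entry point for bdrv_rw_co(): dispatch to the read or write path
 * and store the completion code in RwCo.ret. */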
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
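
/* Set or clear the dirty-bitmap bits covering [sector_num,
 * sector_num + nb_sectors); each bit represents one chunk of
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors, and dirty_count tracks the number
 * of dirty chunks. */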
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}
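
/* Byte-granularity read: the unaligned head and tail of the range go through
 * a one-sector bounce buffer; whole sectors are read in place. */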
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
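
/* Byte-granularity write: the unaligned head and tail are handled with a
 * read-modify-write of one sector; whole sectors are written in place. */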
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
                                        cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

1707
/*
1708
 * Handle a read request in coroutine context
1709
 */
1710
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1711
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1712
    BdrvRequestFlags flags)
1713
{
1714
    BlockDriver *drv = bs->drv;
1715
    BdrvTrackedRequest req;
1716
    int ret;
1717

    
1718
    if (!drv) {
1719
        return -ENOMEDIUM;
1720
    }
1721
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1722
        return -EIO;
1723
    }
1724

    
1725
    /* throttling disk read I/O */
1726
    if (bs->io_limits_enabled) {
1727
        bdrv_io_limits_intercept(bs, false, nb_sectors);
1728
    }
1729

    
1730
    if (bs->copy_on_read) {
1731
        flags |= BDRV_REQ_COPY_ON_READ;
1732
    }
1733
    if (flags & BDRV_REQ_COPY_ON_READ) {
1734
        bs->copy_on_read_in_flight++;
1735
    }
1736

    
1737
    if (bs->copy_on_read_in_flight) {
1738
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1739
    }
1740

    
1741
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1742

    
1743
    if (flags & BDRV_REQ_COPY_ON_READ) {
1744
        int pnum;
1745

    
1746
        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1747
        if (ret < 0) {
1748
            goto out;
1749
        }
1750

    
1751
        if (!ret || pnum != nb_sectors) {
1752
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1753
            goto out;
1754
        }
1755
    }
1756

    
1757
    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1758

    
1759
out:
1760
    tracked_request_end(&req);
1761

    
1762
    if (flags & BDRV_REQ_COPY_ON_READ) {
1763
        bs->copy_on_read_in_flight--;
1764
    }
1765

    
1766
    return ret;
1767
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}
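
/*
 * Write zeroes to a sector range, preferring the driver's dedicated
 * write-zeroes callback and falling back to a writev of an explicitly
 * zero-filled bounce buffer when the driver lacks one.
 */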
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}
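
/*
 * Note: besides issuing the driver write, the path below updates the dirty
 * bitmap (consumed e.g. by block migration) and tracks the highest written
 * sector, which query-blockstats reports as wr_highest_offset.
 */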
/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}
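
/* An MS-DOS MBR partition table entry, as read from offset 0x1be of the
   boot sector by guess_disk_lchs() below. */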
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;

/* Try to guess the disk logical geometry from the MS-DOS partition table.
   Return 0 if OK, -1 if it could not be guessed. */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;

    bdrv_get_geometry(bs, &nb_sectors);

    ret = bdrv_read(bs, 0, buf, 1);
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for(i = 0; i < 4; i++) {
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}
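
/*
 * Pick a disk geometry: use an explicit geometry hint if one was set,
 * otherwise try the partition table via guess_disk_lchs(), and finally
 * fall back to a standard 16-head/63-sector physical geometry with a
 * suitable BIOS translation.
 */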
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
    FDriveRate rate;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};
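
/*
 * Determine floppy geometry: honour a fully specified user geometry hint;
 * otherwise match the image size against fd_formats, preferring an exact
 * size match for the requested drive type and falling back to the first
 * compatible entry.
 */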
void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive,
                                   FDriveRate *rate)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
        *rate = FDRIVE_RATE_500K;
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
        *rate = parse->rate;
    }
}

int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}
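
/*
 * Set the key for an encrypted image. An encrypted backing file gets the
 * key first; on success the media-change callback that was skipped at
 * open time (while a key was still missing) is finally invoked.
 */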
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (!bs->drv) {
        buf[0] = '\0';
    } else {
        pstrcpy(buf, buf_size, bs->drv->format_name);
    }
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
            bdrv_flush(bs);
        }
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}
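
/* State shared between bdrv_is_allocated() and its coroutine entry point. */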
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
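
/*
 * The wrapper above is the usual pattern for calling a coroutine_fn from
 * synchronous code: create and enter a coroutine, then pump the main loop
 * with qemu_aio_wait() until the coroutine flags completion.
 */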

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

/* Consider exposing this as a full fledged QMP command */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}
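
/*
 * Revert to a snapshot. If the format driver cannot do it directly, the
 * image is closed, the operation is delegated to the protocol layer
 * (bs->file), and the image is then reopened on top of the reverted file.
 */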
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    if (!bs->drv) {
        return NULL;
    }

    if (bs->backing_hd) {
        if (strcmp(bs->backing_file, backing_file) == 0) {
            return bs->backing_hd;
        } else {
            return bdrv_find_backing_image(bs->backing_hd, backing_file);
        }
    }

    return NULL;
}
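
/*
 * get_human_readable_size() formats a byte count with a K/M/G/T suffix
 * using a base of 1024, e.g. 1536 -> "1.5K" and 1048576 -> "1.0M".
 */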
#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for(i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size, "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
2803

    
2804
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2805
{
2806
    char buf1[128], date_buf[128], clock_buf[128];
2807
#ifdef _WIN32
2808
    struct tm *ptm;
2809
#else
2810
    struct tm tm;
2811
#endif
2812
    time_t ti;
2813
    int64_t secs;
2814

    
2815
    if (!sn) {
2816
        snprintf(buf, buf_size,
2817
                 "%-10s%-20s%7s%20s%15s",
2818
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2819
    } else {
2820
        ti = sn->date_sec;
2821
#ifdef _WIN32
2822
        ptm = localtime(&ti);
2823
        strftime(date_buf, sizeof(date_buf),
2824
                 "%Y-%m-%d %H:%M:%S", ptm);
2825
#else
2826
        localtime_r(&ti, &tm);
2827
        strftime(date_buf, sizeof(date_buf),
2828
                 "%Y-%m-%d %H:%M:%S", &tm);
2829
#endif
2830
        secs = sn->vm_clock_nsec / 1000000000;
2831
        snprintf(clock_buf, sizeof(clock_buf),
2832
                 "%02d:%02d:%02d.%03d",
2833
                 (int)(secs / 3600),
2834
                 (int)((secs / 60) % 60),
2835
                 (int)(secs % 60),
2836
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2837
        snprintf(buf, buf_size,
2838
                 "%-10s%-20s%7s%20s%15s",
2839
                 sn->id_str, sn->name,
2840
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2841
                 date_buf,
2842
                 clock_buf);
2843
    }
2844
    return buf;
2845
}

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}
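
/*
 * Completion bookkeeping for bdrv_aio_multiwrite(): num_requests counts the
 * merged requests still in flight, while the callbacks array keeps one entry
 * per original request so every caller gets its own completion callback.
 */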
typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
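
/*
 * Example: requests covering sectors [0,8) and [8,16) are merged into one
 * 16-sector request whose qiov concatenates both vectors; a second request
 * starting at sector 4 instead overlaps, so the duplicated tail of the
 * first request is dropped before concatenation.
 */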

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the requests
 * may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}

/* block I/O throttling */
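/*
 * Throttling works on time slices: the I/O accounted since slice_start is
 * compared against the configured limit scaled by the slice length; a
 * request that would exceed the budget reports back how long the caller
 * should wait before dispatching it.
 */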
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written, obtained
     *             from the history statistics.
     * bytes_res:  the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: the total time needed to
     *             complete reading/writing all the data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limit, bs->slice_end needs
     * to be extended so that the current statistics can be kept until the
     * timer fires; the factor used here was tuned experimentally.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
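
/*
 * Worked example (hypothetical numbers): with iops_limit = 100 and 149
 * operations already accounted 1.0s into the slice, the next request gives
 * wait_time = 150/100 - 1.0 = 0.5s before it may be dispatched.
 */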
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}

/**************************************************************/
/* async block device emulation */
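
/*
 * The helpers below emulate asynchronous I/O for drivers that only provide
 * synchronous bdrv_read/bdrv_write: the request is executed immediately on
 * a bounce buffer, and completion is signalled from a bottom half so the
 * caller still sees AIO semantics.
 */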

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
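
/*
 * Coroutine-backed AIO: the request runs in a coroutine via bdrv_co_do_rw()
 * and completion is again deferred to a bottom half, so the callback never
 * runs from within the submitting context.
 */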

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
3365

    
3366
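
/*
 * The coroutine-based emulation above maps one AIO request onto one
 * coroutine running bdrv_co_do_rw().  Note that the completion callback is
 * never invoked from coroutine context: bdrv_co_do_rw() schedules
 * bdrv_co_em_bh(), so the caller's callback always runs from a bottom half.
 */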

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}
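
/*
 * AIOCBs are recycled through a per-pool free list: qemu_aio_release()
 * never frees memory, it pushes the AIOCB back onto pool->free_aiocb, and
 * qemu_aio_get() pops from that list before falling back to g_malloc0().
 * acb->pool is only assigned on a fresh allocation because it remains
 * valid across reuse.
 */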

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
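
/*
 * bdrv_co_io_em() is the inverse adaptor: it lets coroutine code drive a
 * driver that only exposes callback-style AIO.  The control flow is:
 *
 *     submit bs->drv->bdrv_aio_readv/writev(..., bdrv_co_io_em_complete, &co)
 *     qemu_coroutine_yield();                  <- suspend this coroutine
 *     (the AIO request completes eventually)
 *     bdrv_co_io_em_complete(): co->ret = ret;
 *                               qemu_coroutine_enter(co->coroutine, NULL);
 *     return co.ret;                           <- resumed here
 */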

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs->drv) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        return bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (the behaviour may be hardcoded or depend
         * on server-side configuration), so we can't ensure that everything
         * is safe on disk. Returning an error doesn't work either, because
         * that would break guests even when the server does operate in
         * writethrough mode.
         *
         * Let's hope the user knows what they are doing.
         */
        return 0;
    }
}
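
/*
 * To summarize the fallback ladder above: data is first flushed to the
 * host OS (bdrv_co_flush_to_os); then, unless BDRV_O_NO_FLUSH is set, it
 * is forced to stable storage through the first mechanism the driver
 * provides: bdrv_co_flush_to_disk, bdrv_aio_flush (bridged back into the
 * coroutine via bdrv_co_io_em_complete), or, failing both, a no-op.
 */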

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
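
/*
 * bdrv_flush() shows the standard pattern for offering a synchronous API
 * on top of a coroutine implementation, used again by bdrv_discard()
 * below: if the caller is already in a coroutine, run the entry function
 * directly; otherwise spawn a coroutine and pump the event loop with
 * qemu_aio_wait() until rwco.ret leaves the NOT_DONE sentinel.
 */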

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

            bs->dirty_bitmap = g_malloc0(bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
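
/*
 * Sizing example (assuming, for illustration, a BDRV_SECTORS_PER_DIRTY_CHUNK
 * of 2048, i.e. 1 MiB chunks of 512-byte sectors): one bitmap byte covers
 * eight chunks, so a 1 TiB device (2^31 sectors) needs
 * 2^31 / (2048 * 8) = 128 KiB of bitmap.  The "+ BDRV_SECTORS_PER_DIRTY_CHUNK
 * * 8 - 1" term merely rounds the division up.
 */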

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}
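
/*
 * The dirty bitmap is indexed in units of unsigned long:
 * chunk / (sizeof(unsigned long) * 8) selects the word and
 * chunk % (sizeof(unsigned long) * 8) the bit within it.  On a 64-bit
 * host, for example, chunk 70 lives in word 1, bit 6.
 */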

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
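
/*
 * Typical accounting usage from a device model (illustrative sketch, not
 * taken from this file): bracket each guest request with a cookie.
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ... submit the read and wait for its completion ...
 *     bdrv_acct_done(bs, &cookie);   // folds bytes/ops/latency into bs
 */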

int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}
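
/*
 * Illustrative call (a sketch of what a front end such as qemu-img might
 * pass down; the literal values are made up):
 *
 *     ret = bdrv_img_create("overlay.qcow2", "qcow2",
 *                           "base.raw", "raw",  // backing file and format
 *                           NULL,               // no extra -o options
 *                           0x40000000,         // 1 GiB, in bytes
 *                           0);                 // open flags for backing file
 */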

void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type      = job_type;
    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    bs->job = job;
    return job;
}

void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}

int block_job_set_speed(BlockJob *job, int64_t value)
{
    if (!job->job_type->set_speed) {
        return -ENOTSUP;
    }
    return job->job_type->set_speed(job, value);
}

void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
}

bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}
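
/*
 * Typical block job lifecycle (illustrative sketch; the job type and its
 * fields are hypothetical, not defined in this file):
 *
 *     FooJob *s = block_job_create(&foo_job_type, bs, cb, opaque);
 *     if (!s) {
 *         ... bs already has a job or is otherwise in use ...
 *     }
 *     // The job's coroutine periodically checks
 *     // block_job_is_cancelled(&s->common) and eventually calls
 *     // block_job_complete(&s->common, ret), which releases bs again.
 */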