block/qcow.c @ f8a2e5e3

/*
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu-common.h"
#include "block_int.h"
#include "module.h"
#include <zlib.h>
#include "aes.h"
#include "migration.h"

/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 1

#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

#define QCOW_OFLAG_COMPRESSED (1LL << 63)
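
/*
 * On-disk image header (QCOW version 1).  All multi-byte fields are
 * stored big-endian; qcow_open() converts them with the be*_to_cpus()
 * helpers after reading the header from the file.
 */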
typedef struct QCowHeader {
    uint32_t magic;
    uint32_t version;
    uint64_t backing_file_offset;
    uint32_t backing_file_size;
    uint32_t mtime;
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;
    uint8_t l2_bits;
    uint32_t crypt_method;
    uint64_t l1_table_offset;
} QCowHeader;

#define L2_CACHE_SIZE 16
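
/*
 * Per-image state: the whole L1 table is kept in memory, L2 tables go
 * through a small cache of L2_CACHE_SIZE tables (evicted by use count),
 * and the most recently decompressed cluster is kept in cluster_cache
 * (identified by cluster_cache_offset).
 */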
typedef struct BDRVQcowState {
    int cluster_bits;
    int cluster_size;
    int cluster_sectors;
    int l2_bits;
    int l2_size;
    int l1_size;
    uint64_t cluster_offset_mask;
    uint64_t l1_table_offset;
    uint64_t *l1_table;
    uint64_t *l2_cache;
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
    uint8_t *cluster_cache;
    uint8_t *cluster_data;
    uint64_t cluster_cache_offset;
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header;
    AES_KEY aes_encrypt_key;
    AES_KEY aes_decrypt_key;
    CoMutex lock;
    Error *migration_blocker;
} BDRVQcowState;

static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
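
/* Probe: report a strong match (100) when the buffer starts with the
   QCOW magic and version 1, otherwise no match (0). */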
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    const QCowHeader *cow_header = (const void *)buf;

    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
        return 100;
    else
        return 0;
}
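
/* Open an image: read and byte-swap the header, load the L1 table,
   allocate the L2 and cluster caches and register a migration blocker
   (live migration is not supported with qcow images). */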
static int qcow_open(BlockDriverState *bs, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift;
    QCowHeader header;

    if (bdrv_pread(bs->file, 0, &header, sizeof(header)) != sizeof(header))
        goto fail;
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be32_to_cpus(&header.mtime);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);

    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
        goto fail;
    if (header.size <= 1 || header.cluster_bits < 9)
        goto fail;
    if (header.crypt_method > QCOW_CRYPT_AES)
        goto fail;
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header)
        bs->encrypted = 1;
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = header.l2_bits;
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;

    /* read the level 1 table */
    shift = s->cluster_bits + s->l2_bits;
    s->l1_size = (header.size + (1LL << shift) - 1) >> shift;

    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
    if (!s->l1_table)
        goto fail;
    if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
        s->l1_size * sizeof(uint64_t))
        goto fail;
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    if (!s->l2_cache)
        goto fail;
    s->cluster_cache = g_malloc(s->cluster_size);
    if (!s->cluster_cache)
        goto fail;
    s->cluster_data = g_malloc(s->cluster_size);
    if (!s->cluster_data)
        goto fail;
    s->cluster_cache_offset = -1;

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023)
            len = 1023;
        if (bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len) != len)
            goto fail;
        bs->backing_file[len] = '\0';
    }

    /* Disable migration when qcow images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
              "qcow", bs->device_name, "live migration");
    migrate_add_blocker(s->migration_blocker);

    qemu_co_mutex_init(&s->lock);
    return 0;

 fail:
    g_free(s->l1_table);
    g_free(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);
    return -1;
}
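
/* Set the AES key derived from the password: the password is copied
   (truncated or zero-padded) into a 16 byte buffer and used as a
   128 bit key for both encryption and decryption. */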
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len, i;

    memset(keybuf, 0, 16);
    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for(i = 0;i < len;i++) {
        keybuf[i] = key[i];
    }
    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
        return -1;
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;
    return 0;
}

/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                            uint8_t *out_buf, const uint8_t *in_buf,
                            int nb_sectors, int enc,
                            const AES_KEY *key)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;

    for(i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
                        ivec.b, enc);
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
}
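
/*
 * Look up (and optionally allocate) the host file offset of the cluster
 * that contains guest byte 'offset', walking the two-level L1/L2 table
 * through the in-memory L2 cache.
 */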
/* 'allocate' is:
 *
 * 0 to not allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated.
 */
static uint64_t get_cluster_offset(BlockDriverState *bs,
                                   uint64_t offset, int allocate,
                                   int compressed_size,
                                   int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int min_index, i, j, l1_index, l2_index;
    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
    uint32_t min_count;
    int new_l2_table;

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    l2_offset = s->l1_table[l1_index];
    new_l2_table = 0;
    if (!l2_offset) {
        if (!allocate)
            return 0;
        /* allocate a new l2 entry */
        l2_offset = bdrv_getlength(bs->file);
        /* round to cluster size */
        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
        /* update the L1 entry */
        s->l1_table[l1_index] = l2_offset;
        tmp = cpu_to_be64(l2_offset);
        if (bdrv_pwrite_sync(bs->file,
                s->l1_table_offset + l1_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
        new_l2_table = 1;
    }
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == s->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++s->l2_cache_counts[i] == 0xffffffff) {
                for(j = 0; j < L2_CACHE_SIZE; j++) {
                    s->l2_cache_counts[j] >>= 1;
                }
            }
            l2_table = s->l2_cache + (i << s->l2_bits);
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < min_count) {
            min_count = s->l2_cache_counts[i];
            min_index = i;
        }
    }
    l2_table = s->l2_cache + (min_index << s->l2_bits);
    if (new_l2_table) {
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
        if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
                s->l2_size * sizeof(uint64_t)) < 0)
            return 0;
    } else {
        if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return 0;
    }
    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;
 found:
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (!cluster_offset ||
        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
        if (!allocate)
            return 0;
        /* allocate a new cluster */
        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
            (n_end - n_start) < s->cluster_sectors) {
            /* if the cluster is already compressed, we must
               decompress it in the case it is not completely
               overwritten */
            if (decompress_cluster(bs, cluster_offset) < 0)
                return 0;
            cluster_offset = bdrv_getlength(bs->file);
            cluster_offset = (cluster_offset + s->cluster_size - 1) &
                ~(s->cluster_size - 1);
            /* write the cluster content */
            if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) !=
                s->cluster_size)
                return -1;
        } else {
            cluster_offset = bdrv_getlength(bs->file);
            if (allocate == 1) {
                /* round to cluster size */
                cluster_offset = (cluster_offset + s->cluster_size - 1) &
                    ~(s->cluster_size - 1);
                bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
                /* if encrypted, we must initialize the cluster
                   content which won't be written */
                if (s->crypt_method &&
                    (n_end - n_start) < s->cluster_sectors) {
                    uint64_t start_sect;
                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
                    memset(s->cluster_data + 512, 0x00, 512);
                    for(i = 0; i < s->cluster_sectors; i++) {
                        if (i < n_start || i >= n_end) {
                            encrypt_sectors(s, start_sect + i,
                                            s->cluster_data,
                                            s->cluster_data + 512, 1, 1,
                                            &s->aes_encrypt_key);
                            if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
                                            s->cluster_data, 512) != 512)
                                return -1;
                        }
                    }
                }
            } else if (allocate == 2) {
                cluster_offset |= QCOW_OFLAG_COMPRESSED |
                    (uint64_t)compressed_size << (63 - s->cluster_bits);
            }
        }
        /* update L2 table */
        tmp = cpu_to_be64(cluster_offset);
        l2_table[l2_index] = tmp;
        if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
    }
    return cluster_offset;
}
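
/* Report whether the cluster containing 'sector_num' is allocated in
   this image and how many of the requested sectors fall into it. */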
static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n;
    uint64_t cluster_offset;

    qemu_co_mutex_lock(&s->lock);
    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
    qemu_co_mutex_unlock(&s->lock);
    index_in_cluster = sector_num & (s->cluster_sectors - 1);
    n = s->cluster_sectors - index_in_cluster;
    if (n > nb_sectors)
        n = nb_sectors;
    *pnum = n;
    return (cluster_offset != 0);
}
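
/* Inflate a raw deflate stream (no zlib header, -12 window bits) and
   require that it expands to exactly 'out_buf_size' bytes. */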
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}
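
/* Decompress the compressed cluster described by 'cluster_offset' into
   s->cluster_cache, unless it is already the cached cluster. */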
static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    int ret, csize;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        csize = cluster_offset >> (63 - s->cluster_bits);
        csize &= (s->cluster_size - 1);
        ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
        if (ret != csize)
            return -1;
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data, csize) < 0) {
            return -1;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}
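
/* Read path: for each cluster, either read from the backing file (or
   return zeros), copy from the decompressed-cluster cache, or read
   from the image file and decrypt if an AES key is set. */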
static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    int ret = 0, n;
    uint64_t cluster_offset;
    struct iovec hd_iov;
    QEMUIOVector hd_qiov;
    uint8_t *buf;
    void *orig_buf;

    if (qiov->niov > 1) {
        buf = orig_buf = qemu_blockalign(bs, qiov->size);
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
    }

    qemu_co_mutex_lock(&s->lock);

    while (nb_sectors != 0) {
        /* prepare next request */
        cluster_offset = get_cluster_offset(bs, sector_num << 9,
                                                 0, 0, 0, 0);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }

        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                hd_iov.iov_base = (void *)buf;
                hd_iov.iov_len = n * 512;
                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_readv(bs->backing_hd, sector_num,
                                    n, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                /* Note: in this case, no need to wait */
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            /* add AIO support for compressed blocks ? */
            if (decompress_cluster(bs, cluster_offset) < 0) {
                goto fail;
            }
            memcpy(buf,
                   s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            if ((cluster_offset & 511) != 0) {
                goto fail;
            }
            hd_iov.iov_base = (void *)buf;
            hd_iov.iov_len = n * 512;
            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_readv(bs->file,
                                (cluster_offset >> 9) + index_in_cluster,
                                n, &hd_qiov);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                break;
            }
            if (s->crypt_method) {
                encrypt_sectors(s, sector_num, buf, buf,
                                n, 0,
                                &s->aes_decrypt_key);
            }
        }
        ret = 0;

        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }

done:
    qemu_co_mutex_unlock(&s->lock);

    if (qiov->niov > 1) {
        qemu_iovec_from_buffer(qiov, orig_buf, qiov->size);
        qemu_vfree(orig_buf);
    }

    return ret;

fail:
    ret = -EIO;
    goto done;
}
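
/* Write path: allocate clusters as needed through get_cluster_offset(),
   encrypt the data into a bounce buffer when a key is set, and write
   the sectors to the image file. */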
static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    uint64_t cluster_offset;
    const uint8_t *src_buf;
    int ret = 0, n;
    uint8_t *cluster_data = NULL;
    struct iovec hd_iov;
    QEMUIOVector hd_qiov;
    uint8_t *buf;
    void *orig_buf;

    s->cluster_cache_offset = -1; /* disable compressed cache */

    if (qiov->niov > 1) {
        buf = orig_buf = qemu_blockalign(bs, qiov->size);
        qemu_iovec_to_buffer(qiov, buf);
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
    }

    qemu_co_mutex_lock(&s->lock);

    while (nb_sectors != 0) {

        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
                                            index_in_cluster,
                                            index_in_cluster + n);
        if (!cluster_offset || (cluster_offset & 511) != 0) {
            ret = -EIO;
            break;
        }
        if (s->crypt_method) {
            if (!cluster_data) {
                cluster_data = g_malloc0(s->cluster_size);
            }
            encrypt_sectors(s, sector_num, cluster_data, buf,
                            n, 1, &s->aes_encrypt_key);
            src_buf = cluster_data;
        } else {
            src_buf = buf;
        }

        hd_iov.iov_base = (void *)src_buf;
        hd_iov.iov_len = n * 512;
        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_writev(bs->file,
                             (cluster_offset >> 9) + index_in_cluster,
                             n, &hd_qiov);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            break;
        }
        ret = 0;

        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    qemu_co_mutex_unlock(&s->lock);

    if (qiov->niov > 1) {
        qemu_vfree(orig_buf);
    }
    g_free(cluster_data);

    return ret;
}
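
/* Release the tables and caches and drop the migration blocker. */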
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    g_free(s->l1_table);
    g_free(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);

    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
}
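
/* Create a new image: write a version 1 header, the optional backing
   file name and an all-zero L1 table. */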
static int qcow_create(const char *filename, QEMUOptionParameter *options)
{
    int fd, header_size, backing_filename_len, l1_size, i, shift;
    QCowHeader header;
    uint64_t tmp;
    int64_t total_size = 0;
    const char *backing_file = NULL;
    int flags = 0;
    int ret;

    /* Read out options */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            total_size = options->value.n / 512;
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
        }
        options++;
    }

    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
    if (fd < 0)
        return -errno;
    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
    header.size = cpu_to_be64(total_size * 512);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
        if (strcmp(backing_file, "fat:")) {
            header.backing_file_offset = cpu_to_be64(header_size);
            backing_filename_len = strlen(backing_file);
            header.backing_file_size = cpu_to_be32(backing_filename_len);
            header_size += backing_filename_len;
        } else {
            /* special backing file for vvfat */
            backing_file = NULL;
        }
        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
                                    unmodified sectors */
        header.l2_bits = 12; /* 32 KB L2 tables */
    } else {
        header.cluster_bits = 12; /* 4 KB clusters */
        header.l2_bits = 9; /* 4 KB L2 tables */
    }
    header_size = (header_size + 7) & ~7;
    shift = header.cluster_bits + header.l2_bits;
    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;

    header.l1_table_offset = cpu_to_be64(header_size);
    if (flags & BLOCK_FLAG_ENCRYPT) {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    } else {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    }

    /* write all the data */
    ret = qemu_write_full(fd, &header, sizeof(header));
    if (ret != sizeof(header)) {
        ret = -errno;
        goto exit;
    }

    if (backing_file) {
        ret = qemu_write_full(fd, backing_file, backing_filename_len);
        if (ret != backing_filename_len) {
            ret = -errno;
            goto exit;
        }

    }
    lseek(fd, header_size, SEEK_SET);
    tmp = 0;
    for(i = 0;i < l1_size; i++) {
        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
        if (ret != sizeof(tmp)) {
            ret = -errno;
            goto exit;
        }
    }

    ret = 0;
exit:
    close(fd);
    return ret;
}
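
/* Discard all data: zero the L1 table on disk, truncate the file right
   after it and invalidate the in-memory L2 cache. */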
static int qcow_make_empty(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
            l1_length) < 0)
        return -1;
    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));

    return 0;
}
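
/* Compress one cluster with zlib and store it as a compressed cluster;
   if the result would not be smaller than the cluster size, fall back
   to a normal uncompressed write. */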
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors != s->cluster_sectors)
        return -EINVAL;

    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
        ret = -EINVAL;
        goto fail;
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        deflateEnd(&strm);
        ret = -EINVAL;
        goto fail;
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
        if (ret < 0) {
            goto fail;
        }
    } else {
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
                                            out_len, 0, 0);
        if (cluster_offset == 0) {
            ret = -EIO;
            goto fail;
        }

        cluster_offset &= s->cluster_offset_mask;
        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;
fail:
    g_free(out_buf);
    return ret;
}

static coroutine_fn int qcow_co_flush(BlockDriverState *bs)
{
    return bdrv_co_flush(bs->file);
}

static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;
    bdi->cluster_size = s->cluster_size;
    return 0;
}


static QEMUOptionParameter qcow_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    {
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    },
    {
        .name = BLOCK_OPT_ENCRYPT,
        .type = OPT_FLAG,
        .help = "Encrypt the image"
    },
    { NULL }
};

static BlockDriver bdrv_qcow = {
    .format_name            = "qcow",
    .instance_size          = sizeof(BDRVQcowState),
    .bdrv_probe             = qcow_probe,
    .bdrv_open              = qcow_open,
    .bdrv_close             = qcow_close,
    .bdrv_create            = qcow_create,

    .bdrv_co_readv          = qcow_co_readv,
    .bdrv_co_writev         = qcow_co_writev,
    .bdrv_co_flush_to_disk  = qcow_co_flush,
    .bdrv_co_is_allocated   = qcow_co_is_allocated,

    .bdrv_set_key           = qcow_set_key,
    .bdrv_make_empty        = qcow_make_empty,
    .bdrv_write_compressed  = qcow_write_compressed,
    .bdrv_get_info          = qcow_get_info,

    .create_options = qcow_create_options,
};

static void bdrv_qcow_init(void)
{
    bdrv_register(&bdrv_qcow);
}

block_init(bdrv_qcow_init);