Statistics
| Branch: | Revision:

root / block / qcow2.c @ f7d0fe02

History | View | Annotate | Download (63.2 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block_int.h"
26
#include "module.h"
27
#include <zlib.h>
28
#include "aes.h"
29
#include "block/qcow2.h"
30

    
31
/*
32
  Differences with QCOW:
33

34
  - Support for multiple incremental snapshots.
35
  - Memory management by reference counts.
36
  - Clusters which have a reference count of one have the bit
37
    QCOW_OFLAG_COPIED to optimize write performance.
38
  - Size of compressed clusters is stored in sectors to reduce bit usage
39
    in the cluster offsets.
40
  - Support for storing additional data (such as the VM state) in the
41
    snapshots.
42
  - If a backing store is used, the cluster size is not constrained
43
    (could be backported to QCOW).
44
  - L2 tables always have a size of one cluster.
45
*/
46

    
47
//#define DEBUG_ALLOC
48
//#define DEBUG_ALLOC2
49
//#define DEBUG_EXT
50

    
51

    
52
/*
 * Fixed-size header that precedes every optional header extension.
 * Both fields are stored big-endian in the image and are byte-swapped
 * right after being read.
 */
typedef struct {
    uint32_t magic;     /* extension type, see QCOW_EXT_MAGIC_* */
    uint32_t len;       /* payload length in bytes (padding not included) */
} QCowExtension;

/* magic value that terminates the list of extensions */
#define QCOW_EXT_MAGIC_END              0
/* the payload is the name of the backing file format */
#define QCOW_EXT_MAGIC_BACKING_FORMAT   0xE2792ACA
58

    
59

    
60
/*
 * Fixed part of a snapshot table entry.  The structure is packed so that
 * it has no padding between fields; the header is 8 byte aligned.
 * It is followed by variable-length data, in this order: the extra data,
 * the id string, then the name.
 */
typedef struct __attribute__((packed)) QCowSnapshotHeader {
    uint64_t l1_table_offset;
    uint32_t l1_size;
    uint16_t id_str_size;       /* length of the id string that follows */
    uint16_t name_size;         /* length of the name that follows */
    uint32_t date_sec;
    uint32_t date_nsec;
    uint64_t vm_clock_nsec;
    uint32_t vm_state_size;
    uint32_t extra_data_size;   /* for extension */
    /* extra data follows */
    /* id_str follows */
    /* name follows */
} QCowSnapshotHeader;
79

    
80

    
81
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
82
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
83
                     uint8_t *buf, int nb_sectors);
84
static int qcow_read_snapshots(BlockDriverState *bs);
85
static void qcow_free_snapshots(BlockDriverState *bs);
86

    
87
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
88
{
89
    const QCowHeader *cow_header = (const void *)buf;
90

    
91
    if (buf_size >= sizeof(QCowHeader) &&
92
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
93
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
94
        return 100;
95
    else
96
        return 0;
97
}
98

    
99

    
100
/* 
101
 * read qcow2 extension and fill bs
102
 * start reading from start_offset
103
 * finish reading upon magic of value 0 or when end_offset reached
104
 * unknown magic is skipped (future extension this version knows nothing about)
105
 * return 0 upon success, non-0 otherwise
106
 */
107
static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
108
                                uint64_t end_offset)
109
{
110
    BDRVQcowState *s = bs->opaque;
111
    QCowExtension ext;
112
    uint64_t offset;
113

    
114
#ifdef DEBUG_EXT
115
    printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
116
#endif
117
    offset = start_offset;
118
    while (offset < end_offset) {
119

    
120
#ifdef DEBUG_EXT
121
        /* Sanity check */
122
        if (offset > s->cluster_size)
123
            printf("qcow_handle_extension: suspicious offset %lu\n", offset);
124

    
125
        printf("attemting to read extended header in offset %lu\n", offset);
126
#endif
127

    
128
        if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
129
            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
130
                    (unsigned long long)offset);
131
            return 1;
132
        }
133
        be32_to_cpus(&ext.magic);
134
        be32_to_cpus(&ext.len);
135
        offset += sizeof(ext);
136
#ifdef DEBUG_EXT
137
        printf("ext.magic = 0x%x\n", ext.magic);
138
#endif
139
        switch (ext.magic) {
140
        case QCOW_EXT_MAGIC_END:
141
            return 0;
142

    
143
        case QCOW_EXT_MAGIC_BACKING_FORMAT:
144
            if (ext.len >= sizeof(bs->backing_format)) {
145
                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
146
                        " (>=%zu)\n",
147
                        ext.len, sizeof(bs->backing_format));
148
                return 2;
149
            }
150
            if (bdrv_pread(s->hd, offset , bs->backing_format,
151
                           ext.len) != ext.len)
152
                return 3;
153
            bs->backing_format[ext.len] = '\0';
154
#ifdef DEBUG_EXT
155
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
156
#endif
157
            offset += ((ext.len + 7) & ~7);
158
            break;
159

    
160
        default:
161
            /* unknown magic -- just skip it */
162
            offset += ((ext.len + 7) & ~7);
163
            break;
164
        }
165
    }
166

    
167
    return 0;
168
}
169

    
170

    
171
/*
 * Open a qcow2 image: read and validate the header, derive the geometry
 * fields of BDRVQcowState, load the L1 table, allocate the caches,
 * initialise the refcount state, then read the header extensions, the
 * backing file name and the snapshot table.
 *
 * Returns 0 on success, the error returned by bdrv_file_open() if the
 * file cannot be opened, and -1 on any later failure (all resources
 * acquired so far are released on that path).
 */
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift, ret;
    QCowHeader header;
    uint64_t ext_end;

    /* Performance is terrible right now with cache=writethrough due mainly
     * to reference count updates.  If the user does not explicitly specify
     * a caching type, force to writeback caching.
     */
    if ((flags & BDRV_O_CACHE_DEF)) {
        flags |= BDRV_O_CACHE_WB;
        flags &= ~BDRV_O_CACHE_DEF;
    }
    ret = bdrv_file_open(&s->hd, filename, flags);
    if (ret < 0)
        return ret;
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
        goto fail;

    /* all header fields are stored big-endian */
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
        goto fail;
    if (header.size <= 1 ||
        header.cluster_bits < MIN_CLUSTER_BITS ||
        header.cluster_bits > MAX_CLUSTER_BITS)
        goto fail;
    if (header.crypt_method > QCOW_CRYPT_AES)
        goto fail;
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header)
        bs->encrypted = 1;
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
    s->refcount_table_offset = header.refcount_table_offset;
    /* refcount table entries are 8 bytes each */
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
    s->l1_size = header.l1_size;
    shift = s->cluster_bits + s->l2_bits;
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index)
        goto fail;
    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
        s->l1_size * sizeof(uint64_t))
        goto fail;
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    s->cluster_cache = qemu_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                                  + 512);
    s->cluster_cache_offset = -1;

    if (refcount_init(bs) < 0)
        goto fail;

    /* read qcow2 extensions: they sit between the header and the backing
       file name, or within the first cluster if there is no backing file */
    if (header.backing_file_offset)
        ext_end = header.backing_file_offset;
    else
        ext_end = s->cluster_size;
    if (qcow_read_extensions(bs, sizeof(header), ext_end))
        goto fail;

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        /*
         * Clamp while the size is still unsigned.  Assigning the 32-bit
         * header field to the signed 'len' first would turn values
         * >= 2^31 into negative numbers that pass a "len > 1023" test
         * and are then used as a read length and as an array index.
         * NOTE(review): 1023 presumably is sizeof(bs->backing_file) - 1;
         * confirm against the BlockDriverState definition.
         */
        if (header.backing_file_size > 1023)
            len = 1023;
        else
            len = header.backing_file_size;
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
            goto fail;
        bs->backing_file[len] = '\0';
    }
    if (qcow_read_snapshots(bs) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;

 fail:
    qcow_free_snapshots(bs);
    refcount_close(bs);
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    bdrv_delete(s->hd);
    return -1;
}
294

    
295
/*
 * Install the encryption key of an image.
 *
 * The 128-bit AES key is made of the first 16 bytes of the passphrase
 * 'key', padded with zero bytes when the passphrase is shorter.
 * The crypt method announced by the image header becomes the active one.
 *
 * Returns 0 on success, -1 if either AES key schedule cannot be set up.
 */
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len, i;

    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for (i = 0; i < 16; i++)
        keybuf[i] = (i < len) ? key[i] : 0;

    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0 ||
        AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;

    return 0;
}
336

    
337
/* The crypt function is compatible with the linux cryptoloop
338
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
339
   supported */
340
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
341
                            uint8_t *out_buf, const uint8_t *in_buf,
342
                            int nb_sectors, int enc,
343
                            const AES_KEY *key)
344
{
345
    union {
346
        uint64_t ll[2];
347
        uint8_t b[16];
348
    } ivec;
349
    int i;
350

    
351
    for(i = 0; i < nb_sectors; i++) {
352
        ivec.ll[0] = cpu_to_le64(sector_num);
353
        ivec.ll[1] = 0;
354
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
355
                        ivec.b, enc);
356
        sector_num++;
357
        in_buf += 512;
358
        out_buf += 512;
359
    }
360
}
361

    
362
/*
 * Copy the guest sectors [n_start, n_end) of the cluster that begins at
 * guest sector start_sect into the host cluster at cluster_offset.
 *
 * The data is read through qcow_read() into s->cluster_data, encrypted in
 * place when a crypt method is active, and written to the image file at
 * the same sector index inside the destination cluster.
 *
 * Returns 0 on success (also when the range is empty) or the negative
 * value returned by the failing read/write.
 */
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                        uint64_t cluster_offset, int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int count = n_end - n_start;
    int ret;

    if (count <= 0)
        return 0;

    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, count);
    if (ret < 0)
        return ret;

    if (s->crypt_method) {
        encrypt_sectors(s, start_sect + n_start,
                        s->cluster_data, s->cluster_data,
                        count, 1, &s->aes_encrypt_key);
    }

    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
                     s->cluster_data, count);
    return (ret < 0) ? ret : 0;
}
386

    
387
/*
 * Empty the L2 table cache: zero the hit counters, the file offsets that
 * tag each slot, and the cached table contents themselves.
 */
void l2_cache_reset(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
}
395

    
396
/*
 * Pick the L2 cache slot to (re)use: the one with the smallest hit count.
 * When several slots share the smallest count the lowest index wins; slot
 * 0 is returned if no counter is below 0xffffffff.
 */
static inline int l2_cache_new_entry(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint32_t lowest = 0xffffffff;
    int victim = 0;
    int i;

    for (i = 0; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] >= lowest)
            continue;
        lowest = s->l2_cache_counts[i];
        victim = i;
    }

    return victim;
}
413

    
414
/*
 * Round offset up to the next multiple of n.
 * The bit trick only works when n is a power of two.
 */
static int64_t align_offset(int64_t offset, int n)
{
    const int64_t low_bits = n - 1;

    return (offset + low_bits) & ~low_bits;
}
419

    
420
/*
 * Grow the L1 table so that it holds at least min_size entries.
 *
 * The size is increased by roughly a factor 1.5 until it is large enough.
 * A new table is allocated in the image, the current entries are copied
 * into it (new entries are zero), it is written out, and then the l1_size
 * and l1_table_offset fields of the image header are updated with one
 * 12-byte write.  Finally the old table's clusters are freed and the
 * in-memory state is switched over.
 *
 * Returns 0 on success (including when no growth is needed) and -EIO if
 * writing the table or the header fails; in that case the previous L1
 * table stays in place and usable.
 */
static int grow_l1_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size, new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    uint64_t new_l1_table_offset;
    uint8_t data[12];

    new_l1_size = s->l1_size;
    if (min_size <= new_l1_size)
        return 0;
    while (min_size > new_l1_size) {
        new_l1_size = (new_l1_size * 3 + 1) / 2;
    }
#ifdef DEBUG_ALLOC2
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    /* size of the new table in bytes */
    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_mallocz(new_l1_size2);
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);

    /* the on-disk table is big-endian: swap, write, swap back */
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
    if (ret != new_l1_size2)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table: 32-bit size immediately followed by 64-bit offset,
       written starting at the header's l1_size field */
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
                sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->l1_table);
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
 fail:
    /*
     * Only the new table may be released here.  s->l1_table is still the
     * live table referenced by the driver state; freeing it would leave a
     * dangling pointer behind while leaking new_l1_table.
     * NOTE(review): the clusters obtained from alloc_clusters() are not
     * returned on this path.
     */
    qemu_free(new_l1_table);
    return -EIO;
}
469

    
470
/*
471
 * seek_l2_table
472
 *
473
 * seek l2_offset in the l2_cache table
474
 * if not found, return NULL,
475
 * if found,
476
 *   increments the l2 cache hit count of the entry,
477
 *   if counter overflow, divide by two all counters
478
 *   return the pointer to the l2 cache entry
479
 *
480
 */
481

    
482
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
483
{
484
    int i, j;
485

    
486
    for(i = 0; i < L2_CACHE_SIZE; i++) {
487
        if (l2_offset == s->l2_cache_offsets[i]) {
488
            /* increment the hit count */
489
            if (++s->l2_cache_counts[i] == 0xffffffff) {
490
                for(j = 0; j < L2_CACHE_SIZE; j++) {
491
                    s->l2_cache_counts[j] >>= 1;
492
                }
493
            }
494
            return s->l2_cache + (i << s->l2_bits);
495
        }
496
    }
497
    return NULL;
498
}
499

    
500
/*
501
 * l2_load
502
 *
503
 * Loads a L2 table into memory. If the table is in the cache, the cache
504
 * is used; otherwise the L2 table is loaded from the image file.
505
 *
506
 * Returns a pointer to the L2 table on success, or NULL if the read from
507
 * the image file failed.
508
 */
509

    
510
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
511
{
512
    BDRVQcowState *s = bs->opaque;
513
    int min_index;
514
    uint64_t *l2_table;
515

    
516
    /* seek if the table for the given offset is in the cache */
517

    
518
    l2_table = seek_l2_table(s, l2_offset);
519
    if (l2_table != NULL)
520
        return l2_table;
521

    
522
    /* not found: load a new entry in the least used one */
523

    
524
    min_index = l2_cache_new_entry(bs);
525
    l2_table = s->l2_cache + (min_index << s->l2_bits);
526
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
527
        s->l2_size * sizeof(uint64_t))
528
        return NULL;
529
    s->l2_cache_offsets[min_index] = l2_offset;
530
    s->l2_cache_counts[min_index] = 1;
531

    
532
    return l2_table;
533
}
534

    
535
/*
536
 * l2_allocate
537
 *
538
 * Allocate a new l2 entry in the file. If l1_index points to an already
539
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
540
 * table) copy the contents of the old L2 table into the newly allocated one.
541
 * Otherwise the new table is initialized with zeros.
542
 *
543
 */
544

    
545
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
546
{
547
    BDRVQcowState *s = bs->opaque;
548
    int min_index;
549
    uint64_t old_l2_offset, tmp;
550
    uint64_t *l2_table, l2_offset;
551

    
552
    old_l2_offset = s->l1_table[l1_index];
553

    
554
    /* allocate a new l2 entry */
555

    
556
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
557

    
558
    /* update the L1 entry */
559

    
560
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
561

    
562
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
563
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
564
                    &tmp, sizeof(tmp)) != sizeof(tmp))
565
        return NULL;
566

    
567
    /* allocate a new entry in the l2 cache */
568

    
569
    min_index = l2_cache_new_entry(bs);
570
    l2_table = s->l2_cache + (min_index << s->l2_bits);
571

    
572
    if (old_l2_offset == 0) {
573
        /* if there was no old l2 table, clear the new table */
574
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
575
    } else {
576
        /* if there was an old l2 table, read it from the disk */
577
        if (bdrv_pread(s->hd, old_l2_offset,
578
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
579
            s->l2_size * sizeof(uint64_t))
580
            return NULL;
581
    }
582
    /* write the l2 table to the file */
583
    if (bdrv_pwrite(s->hd, l2_offset,
584
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
585
        s->l2_size * sizeof(uint64_t))
586
        return NULL;
587

    
588
    /* update the l2 cache entry */
589

    
590
    s->l2_cache_offsets[min_index] = l2_offset;
591
    s->l2_cache_counts[min_index] = 1;
592

    
593
    return l2_table;
594
}
595

    
596
/*
 * Count how many of the big-endian L2 entries l2_table[start] ..
 * l2_table[start + nb_clusters - 1] continue the contiguous run that
 * begins at l2_table[0]: entry i matches when, after clearing the bits in
 * mask, it equals entry 0 plus i * cluster_size.
 *
 * Returns the number of matching entries counted from 'start', or 0 if
 * entry 0 is zero once the mask bits are cleared.
 */
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t start, uint64_t mask)
{
    int i;
    uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;

    if (!offset)
        return 0;

    for (i = start; i < start + nb_clusters; i++) {
        /* multiply in 64 bits: i * cluster_size may not fit in an int */
        if (offset + (uint64_t)i * cluster_size !=
            (be64_to_cpu(l2_table[i]) & ~mask))
            break;
    }

    return (i - start);
}
611

    
612
/*
 * Count the leading zero entries of l2_table, looking at no more than
 * nb_clusters entries.  With nb_clusters == 0 the table is not accessed.
 */
static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
{
    int run;

    for (run = 0; nb_clusters != 0; nb_clusters--, run++) {
        if (l2_table[run] != 0)
            break;
    }

    return run;
}
621

    
622
/*
623
 * get_cluster_offset
624
 *
625
 * For a given offset of the disk image, return cluster offset in
626
 * qcow2 file.
627
 *
628
 * on entry, *num is the number of contiguous clusters we'd like to
629
 * access following offset.
630
 *
631
 * on exit, *num is the number of contiguous clusters we can read.
632
 *
633
 * Return 1, if the offset is found
634
 * Return 0, otherwise.
635
 *
636
 */
637

    
638
static uint64_t get_cluster_offset(BlockDriverState *bs,
639
                                   uint64_t offset, int *num)
640
{
641
    BDRVQcowState *s = bs->opaque;
642
    int l1_index, l2_index;
643
    uint64_t l2_offset, *l2_table, cluster_offset;
644
    int l1_bits, c;
645
    int index_in_cluster, nb_available, nb_needed, nb_clusters;
646

    
647
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
648
    nb_needed = *num + index_in_cluster;
649

    
650
    l1_bits = s->l2_bits + s->cluster_bits;
651

    
652
    /* compute how many bytes there are between the offset and
653
     * the end of the l1 entry
654
     */
655

    
656
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
657

    
658
    /* compute the number of available sectors */
659

    
660
    nb_available = (nb_available >> 9) + index_in_cluster;
661

    
662
    if (nb_needed > nb_available) {
663
        nb_needed = nb_available;
664
    }
665

    
666
    cluster_offset = 0;
667

    
668
    /* seek the the l2 offset in the l1 table */
669

    
670
    l1_index = offset >> l1_bits;
671
    if (l1_index >= s->l1_size)
672
        goto out;
673

    
674
    l2_offset = s->l1_table[l1_index];
675

    
676
    /* seek the l2 table of the given l2 offset */
677

    
678
    if (!l2_offset)
679
        goto out;
680

    
681
    /* load the l2 table in memory */
682

    
683
    l2_offset &= ~QCOW_OFLAG_COPIED;
684
    l2_table = l2_load(bs, l2_offset);
685
    if (l2_table == NULL)
686
        return 0;
687

    
688
    /* find the cluster offset for the given disk offset */
689

    
690
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
691
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
692
    nb_clusters = size_to_clusters(s, nb_needed << 9);
693

    
694
    if (!cluster_offset) {
695
        /* how many empty clusters ? */
696
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
697
    } else {
698
        /* how many allocated clusters ? */
699
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
700
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
701
    }
702

    
703
   nb_available = (c * s->cluster_sectors);
704
out:
705
    if (nb_available > nb_needed)
706
        nb_available = nb_needed;
707

    
708
    *num = nb_available - index_in_cluster;
709

    
710
    return cluster_offset & ~QCOW_OFLAG_COPIED;
711
}
712

    
713
/*
714
 * free_any_clusters
715
 *
716
 * free clusters according to its type: compressed or not
717
 *
718
 */
719

    
720
static void free_any_clusters(BlockDriverState *bs,
721
                              uint64_t cluster_offset, int nb_clusters)
722
{
723
    BDRVQcowState *s = bs->opaque;
724

    
725
    /* free the cluster */
726

    
727
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
728
        int nb_csectors;
729
        nb_csectors = ((cluster_offset >> s->csize_shift) &
730
                       s->csize_mask) + 1;
731
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
732
                      nb_csectors * 512);
733
        return;
734
    }
735

    
736
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
737

    
738
    return;
739
}
740

    
741
/*
742
 * get_cluster_table
743
 *
744
 * for a given disk offset, load (and allocate if needed)
745
 * the l2 table.
746
 *
747
 * the l2 table offset in the qcow2 file and the cluster index
748
 * in the l2 table are given to the caller.
749
 *
750
 */
751

    
752
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
753
                             uint64_t **new_l2_table,
754
                             uint64_t *new_l2_offset,
755
                             int *new_l2_index)
756
{
757
    BDRVQcowState *s = bs->opaque;
758
    int l1_index, l2_index, ret;
759
    uint64_t l2_offset, *l2_table;
760

    
761
    /* seek the the l2 offset in the l1 table */
762

    
763
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
764
    if (l1_index >= s->l1_size) {
765
        ret = grow_l1_table(bs, l1_index + 1);
766
        if (ret < 0)
767
            return 0;
768
    }
769
    l2_offset = s->l1_table[l1_index];
770

    
771
    /* seek the l2 table of the given l2 offset */
772

    
773
    if (l2_offset & QCOW_OFLAG_COPIED) {
774
        /* load the l2 table in memory */
775
        l2_offset &= ~QCOW_OFLAG_COPIED;
776
        l2_table = l2_load(bs, l2_offset);
777
        if (l2_table == NULL)
778
            return 0;
779
    } else {
780
        if (l2_offset)
781
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
782
        l2_table = l2_allocate(bs, l1_index);
783
        if (l2_table == NULL)
784
            return 0;
785
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
786
    }
787

    
788
    /* find the cluster offset for the given disk offset */
789

    
790
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
791

    
792
    *new_l2_table = l2_table;
793
    *new_l2_offset = l2_offset;
794
    *new_l2_index = l2_index;
795

    
796
    return 1;
797
}
798

    
799
/*
800
 * alloc_compressed_cluster_offset
801
 *
802
 * For a given offset of the disk image, return cluster offset in
803
 * qcow2 file.
804
 *
805
 * If the offset is not found, allocate a new compressed cluster.
806
 *
807
 * Return the cluster offset if successful,
808
 * Return 0, otherwise.
809
 *
810
 */
811

    
812
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
813
                                                uint64_t offset,
814
                                                int compressed_size)
815
{
816
    BDRVQcowState *s = bs->opaque;
817
    int l2_index, ret;
818
    uint64_t l2_offset, *l2_table, cluster_offset;
819
    int nb_csectors;
820

    
821
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
822
    if (ret == 0)
823
        return 0;
824

    
825
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
826
    if (cluster_offset & QCOW_OFLAG_COPIED)
827
        return cluster_offset & ~QCOW_OFLAG_COPIED;
828

    
829
    if (cluster_offset)
830
        free_any_clusters(bs, cluster_offset, 1);
831

    
832
    cluster_offset = alloc_bytes(bs, compressed_size);
833
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
834
                  (cluster_offset >> 9);
835

    
836
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
837
                      ((uint64_t)nb_csectors << s->csize_shift);
838

    
839
    /* update L2 table */
840

    
841
    /* compressed clusters never have the copied flag */
842

    
843
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
844
    if (bdrv_pwrite(s->hd,
845
                    l2_offset + l2_index * sizeof(uint64_t),
846
                    l2_table + l2_index,
847
                    sizeof(uint64_t)) != sizeof(uint64_t))
848
        return 0;
849

    
850
    return cluster_offset;
851
}
852

    
853
/*
 * Description of a cluster allocation whose L2 entries still have to be
 * written; consumed by alloc_cluster_link_l2().
 */
typedef struct QCowL2Meta
{
    uint64_t offset;    /* guest offset the allocation was made for */
    int n_start;        /* sectors before this index in the first cluster
                           are copied from the old data */
    int nb_available;   /* sector count; if it is not a multiple of the
                           cluster size the tail of the last cluster is
                           copied from the old data */
    int nb_clusters;    /* number of L2 entries to update, 0 for none */
} QCowL2Meta;
860

    
861
static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
862
        QCowL2Meta *m)
863
{
864
    BDRVQcowState *s = bs->opaque;
865
    int i, j = 0, l2_index, ret;
866
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
867

    
868
    if (m->nb_clusters == 0)
869
        return 0;
870

    
871
    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
872

    
873
    /* copy content of unmodified sectors */
874
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
875
    if (m->n_start) {
876
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
877
        if (ret < 0)
878
            goto err;
879
    }
880

    
881
    if (m->nb_available & (s->cluster_sectors - 1)) {
882
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
883
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
884
                m->nb_available - end, s->cluster_sectors);
885
        if (ret < 0)
886
            goto err;
887
    }
888

    
889
    ret = -EIO;
890
    /* update L2 table */
891
    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
892
        goto err;
893

    
894
    for (i = 0; i < m->nb_clusters; i++) {
895
        /* if two concurrent writes happen to the same unallocated cluster
896
         * each write allocates separate cluster and writes data concurrently.
897
         * The first one to complete updates l2 table with pointer to its
898
         * cluster the second one has to do RMW (which is done above by
899
         * copy_sectors()), update l2 table with its cluster pointer and free
900
         * old cluster. This is what this loop does */
901
        if(l2_table[l2_index + i] != 0)
902
            old_cluster[j++] = l2_table[l2_index + i];
903

    
904
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
905
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
906
     }
907

    
908
    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
909
                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
910
            m->nb_clusters * sizeof(uint64_t))
911
        goto err;
912

    
913
    for (i = 0; i < j; i++)
914
        free_any_clusters(bs, be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED,
915
                          1);
916

    
917
    ret = 0;
918
err:
919
    qemu_free(old_cluster);
920
    return ret;
921
 }
922

    
923
/*
924
 * alloc_cluster_offset
925
 *
926
 * For a given offset of the disk image, return cluster offset in
927
 * qcow2 file.
928
 *
929
 * If the offset is not found, allocate a new cluster.
930
 *
931
 * Return the cluster offset if successful,
932
 * Return 0, otherwise.
933
 *
934
 */
935

    
936
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
937
                                     uint64_t offset,
938
                                     int n_start, int n_end,
939
                                     int *num, QCowL2Meta *m)
940
{
941
    BDRVQcowState *s = bs->opaque;
942
    int l2_index, ret;
943
    uint64_t l2_offset, *l2_table, cluster_offset;
944
    int nb_clusters, i = 0;
945

    
946
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
947
    if (ret == 0)
948
        return 0;
949

    
950
    nb_clusters = size_to_clusters(s, n_end << 9);
951

    
952
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
953

    
954
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
955

    
956
    /* We keep all QCOW_OFLAG_COPIED clusters */
957

    
958
    if (cluster_offset & QCOW_OFLAG_COPIED) {
959
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
960
                &l2_table[l2_index], 0, 0);
961

    
962
        cluster_offset &= ~QCOW_OFLAG_COPIED;
963
        m->nb_clusters = 0;
964

    
965
        goto out;
966
    }
967

    
968
    /* for the moment, multiple compressed clusters are not managed */
969

    
970
    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
971
        nb_clusters = 1;
972

    
973
    /* how many available clusters ? */
974

    
975
    while (i < nb_clusters) {
976
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
977
                &l2_table[l2_index], i, 0);
978

    
979
        if(be64_to_cpu(l2_table[l2_index + i]))
980
            break;
981

    
982
        i += count_contiguous_free_clusters(nb_clusters - i,
983
                &l2_table[l2_index + i]);
984

    
985
        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
986

    
987
        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
988
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
989
            break;
990
    }
991
    nb_clusters = i;
992

    
993
    /* allocate a new cluster */
994

    
995
    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
996

    
997
    /* save info needed for meta data update */
998
    m->offset = offset;
999
    m->n_start = n_start;
1000
    m->nb_clusters = nb_clusters;
1001

    
1002
out:
1003
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
1004

    
1005
    *num = m->nb_available - n_start;
1006

    
1007
    return cluster_offset;
1008
}
1009

    
1010
/*
 * Report whether the cluster holding sector_num has a mapping in this image.
 *
 * *pnum is seeded with nb_sectors and then handed to get_cluster_offset(),
 * which may update it.  Returns 1 when a non-zero cluster offset was found,
 * 0 otherwise.
 */
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    *pnum = nb_sectors;
    return get_cluster_offset(bs, sector_num << 9, pnum) != 0;
}
1020

    
1021
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1022
                             const uint8_t *buf, int buf_size)
1023
{
1024
    z_stream strm1, *strm = &strm1;
1025
    int ret, out_len;
1026

    
1027
    memset(strm, 0, sizeof(*strm));
1028

    
1029
    strm->next_in = (uint8_t *)buf;
1030
    strm->avail_in = buf_size;
1031
    strm->next_out = out_buf;
1032
    strm->avail_out = out_buf_size;
1033

    
1034
    ret = inflateInit2(strm, -12);
1035
    if (ret != Z_OK)
1036
        return -1;
1037
    ret = inflate(strm, Z_FINISH);
1038
    out_len = strm->next_out - out_buf;
1039
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1040
        out_len != out_buf_size) {
1041
        inflateEnd(strm);
1042
        return -1;
1043
    }
1044
    inflateEnd(strm);
1045
    return 0;
1046
}
1047

    
1048
/*
 * Make sure s->cluster_cache holds the decompressed contents of the
 * compressed cluster described by the L2 entry cluster_offset.
 *
 * Nothing is done when the cache already holds that cluster.  Otherwise the
 * compressed sectors are read into s->cluster_data and inflated into
 * s->cluster_cache.
 *
 * Returns 0 on success, -1 on read or decompression failure.
 */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
{
    int nb_csectors, sector_offset, csize;
    uint64_t coffset = cluster_offset & s->cluster_offset_mask;

    if (s->cluster_cache_offset == coffset)
        return 0;                       /* cache hit */

    /* the entry encodes (number of compressed sectors - 1) */
    nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
    /* compressed data need not start on a sector boundary */
    sector_offset = coffset & 511;
    csize = nb_csectors * 512 - sector_offset;

    if (bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors) < 0)
        return -1;

    if (decompress_buffer(s->cluster_cache, s->cluster_size,
                          s->cluster_data + sector_offset, csize) < 0)
        return -1;

    s->cluster_cache_offset = coffset;
    return 0;
}
1070

    
1071
/* handle reading after the end of the backing file */
1072
static int backing_read1(BlockDriverState *bs,
1073
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
1074
{
1075
    int n1;
1076
    if ((sector_num + nb_sectors) <= bs->total_sectors)
1077
        return nb_sectors;
1078
    if (sector_num >= bs->total_sectors)
1079
        n1 = 0;
1080
    else
1081
        n1 = bs->total_sectors - sector_num;
1082
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
1083
    return n1;
1084
}
1085

    
1086
/*
 * Synchronously read nb_sectors sectors starting at sector_num into buf.
 *
 * The request is processed in pieces of n sectors as reported by
 * get_cluster_offset().  Each piece comes from one of:
 *   - the backing file (zero-padded past its end) or zeroes, when the
 *     cluster offset is 0;
 *   - the decompressed cluster cache, for compressed clusters;
 *   - the image file itself, decrypted afterwards if encryption is on.
 *
 * Returns 0 on success, -1 on failure.
 */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t cluster_offset;
    int n, n1, index_in_cluster, ret;

    for (; nb_sectors > 0; nb_sectors -= n, sector_num += n, buf += n * 512) {
        n = nb_sectors;
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);

        if (!cluster_offset) {
            if (!bs->backing_hd) {
                memset(buf, 0, 512 * n);
                continue;
            }
            /* read from the base image */
            n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
            if (n1 > 0) {
                ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
                if (ret < 0)
                    return -1;
            }
            continue;
        }

        if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
            continue;
        }

        ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512,
                         buf, n * 512);
        if (ret != n * 512)
            return -1;
        if (s->crypt_method) {
            /* decrypt in place */
            encrypt_sectors(s, sector_num, buf, buf, n, 0,
                            &s->aes_decrypt_key);
        }
    }
    return 0;
}
1128

    
1129
typedef struct QCowAIOCB {
1130
    BlockDriverAIOCB common;
1131
    int64_t sector_num;
1132
    QEMUIOVector *qiov;
1133
    uint8_t *buf;
1134
    void *orig_buf;
1135
    int nb_sectors;
1136
    int n;
1137
    uint64_t cluster_offset;
1138
    uint8_t *cluster_data;
1139
    BlockDriverAIOCB *hd_aiocb;
1140
    struct iovec hd_iov;
1141
    QEMUIOVector hd_qiov;
1142
    QEMUBH *bh;
1143
    QCowL2Meta l2meta;
1144
} QCowAIOCB;
1145

    
1146
/*
 * Cancel hook of qcow_aio_pool: cancel the request pending on the
 * underlying device, if any, then release the control block.
 */
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;

    if (acb->hd_aiocb != NULL) {
        bdrv_aio_cancel(acb->hd_aiocb);
    }
    qemu_aio_release(acb);
}
1153

    
1154
static AIOPool qcow_aio_pool = {
1155
    .aiocb_size         = sizeof(QCowAIOCB),
1156
    .cancel             = qcow_aio_cancel,
1157
};
1158

    
1159
static void qcow_aio_read_cb(void *opaque, int ret);
1160
static void qcow_aio_read_bh(void *opaque)
1161
{
1162
    QCowAIOCB *acb = opaque;
1163
    qemu_bh_delete(acb->bh);
1164
    acb->bh = NULL;
1165
    qcow_aio_read_cb(opaque, 0);
1166
}
1167

    
1168
/*
 * Create a bottom half running cb(acb) and schedule it.
 *
 * Returns 0 on success, -EIO when the request already has a bottom half
 * attached or when a new one could not be created.
 */
static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
{
    if (acb->bh != NULL) {
        return -EIO;
    }

    acb->bh = qemu_bh_new(cb, acb);
    if (acb->bh == NULL) {
        return -EIO;
    }

    qemu_bh_schedule(acb->bh);
    return 0;
}
1181

    
1182
static void qcow_aio_read_cb(void *opaque, int ret)
1183
{
1184
    QCowAIOCB *acb = opaque;
1185
    BlockDriverState *bs = acb->common.bs;
1186
    BDRVQcowState *s = bs->opaque;
1187
    int index_in_cluster, n1;
1188

    
1189
    acb->hd_aiocb = NULL;
1190
    if (ret < 0)
1191
        goto done;
1192

    
1193
    /* post process the read buffer */
1194
    if (!acb->cluster_offset) {
1195
        /* nothing to do */
1196
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1197
        /* nothing to do */
1198
    } else {
1199
        if (s->crypt_method) {
1200
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
1201
                            acb->n, 0,
1202
                            &s->aes_decrypt_key);
1203
        }
1204
    }
1205

    
1206
    acb->nb_sectors -= acb->n;
1207
    acb->sector_num += acb->n;
1208
    acb->buf += acb->n * 512;
1209

    
1210
    if (acb->nb_sectors == 0) {
1211
        /* request completed */
1212
        ret = 0;
1213
        goto done;
1214
    }
1215

    
1216
    /* prepare next AIO request */
1217
    acb->n = acb->nb_sectors;
1218
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
1219
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1220

    
1221
    if (!acb->cluster_offset) {
1222
        if (bs->backing_hd) {
1223
            /* read from the base image */
1224
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
1225
                               acb->buf, acb->n);
1226
            if (n1 > 0) {
1227
                acb->hd_iov.iov_base = (void *)acb->buf;
1228
                acb->hd_iov.iov_len = acb->n * 512;
1229
                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
1230
                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
1231
                                    &acb->hd_qiov, acb->n,
1232
                                    qcow_aio_read_cb, acb);
1233
                if (acb->hd_aiocb == NULL)
1234
                    goto done;
1235
            } else {
1236
                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1237
                if (ret < 0)
1238
                    goto done;
1239
            }
1240
        } else {
1241
            /* Note: in this case, no need to wait */
1242
            memset(acb->buf, 0, 512 * acb->n);
1243
            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1244
            if (ret < 0)
1245
                goto done;
1246
        }
1247
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1248
        /* add AIO support for compressed blocks ? */
1249
        if (decompress_cluster(s, acb->cluster_offset) < 0)
1250
            goto done;
1251
        memcpy(acb->buf,
1252
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
1253
        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1254
        if (ret < 0)
1255
            goto done;
1256
    } else {
1257
        if ((acb->cluster_offset & 511) != 0) {
1258
            ret = -EIO;
1259
            goto done;
1260
        }
1261

    
1262
        acb->hd_iov.iov_base = (void *)acb->buf;
1263
        acb->hd_iov.iov_len = acb->n * 512;
1264
        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
1265
        acb->hd_aiocb = bdrv_aio_readv(s->hd,
1266
                            (acb->cluster_offset >> 9) + index_in_cluster,
1267
                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
1268
        if (acb->hd_aiocb == NULL)
1269
            goto done;
1270
    }
1271

    
1272
    return;
1273
done:
1274
    if (acb->qiov->niov > 1) {
1275
        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
1276
        qemu_vfree(acb->orig_buf);
1277
    }
1278
    acb->common.cb(acb->common.opaque, ret);
1279
    qemu_aio_release(acb);
1280
}
1281

    
1282
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1283
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1284
        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
1285
{
1286
    QCowAIOCB *acb;
1287

    
1288
    acb = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque);
1289
    if (!acb)
1290
        return NULL;
1291
    acb->hd_aiocb = NULL;
1292
    acb->sector_num = sector_num;
1293
    acb->qiov = qiov;
1294
    if (qiov->niov > 1) {
1295
        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
1296
        if (is_write)
1297
            qemu_iovec_to_buffer(qiov, acb->buf);
1298
    } else {
1299
        acb->buf = (uint8_t *)qiov->iov->iov_base;
1300
    }
1301
    acb->nb_sectors = nb_sectors;
1302
    acb->n = 0;
1303
    acb->cluster_offset = 0;
1304
    acb->l2meta.nb_clusters = 0;
1305
    return acb;
1306
}
1307

    
1308
static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
1309
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1310
        BlockDriverCompletionFunc *cb, void *opaque)
1311
{
1312
    QCowAIOCB *acb;
1313

    
1314
    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
1315
    if (!acb)
1316
        return NULL;
1317

    
1318
    qcow_aio_read_cb(acb, 0);
1319
    return &acb->common;
1320
}
1321

    
1322
static void qcow_aio_write_cb(void *opaque, int ret)
1323
{
1324
    QCowAIOCB *acb = opaque;
1325
    BlockDriverState *bs = acb->common.bs;
1326
    BDRVQcowState *s = bs->opaque;
1327
    int index_in_cluster;
1328
    const uint8_t *src_buf;
1329
    int n_end;
1330

    
1331
    acb->hd_aiocb = NULL;
1332

    
1333
    if (ret < 0)
1334
        goto done;
1335

    
1336
    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
1337
        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
1338
        goto done;
1339
    }
1340

    
1341
    acb->nb_sectors -= acb->n;
1342
    acb->sector_num += acb->n;
1343
    acb->buf += acb->n * 512;
1344

    
1345
    if (acb->nb_sectors == 0) {
1346
        /* request completed */
1347
        ret = 0;
1348
        goto done;
1349
    }
1350

    
1351
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1352
    n_end = index_in_cluster + acb->nb_sectors;
1353
    if (s->crypt_method &&
1354
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1355
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1356

    
1357
    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1358
                                          index_in_cluster,
1359
                                          n_end, &acb->n, &acb->l2meta);
1360
    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
1361
        ret = -EIO;
1362
        goto done;
1363
    }
1364
    if (s->crypt_method) {
1365
        if (!acb->cluster_data) {
1366
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
1367
                                             s->cluster_size);
1368
        }
1369
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
1370
                        acb->n, 1, &s->aes_encrypt_key);
1371
        src_buf = acb->cluster_data;
1372
    } else {
1373
        src_buf = acb->buf;
1374
    }
1375
    acb->hd_iov.iov_base = (void *)src_buf;
1376
    acb->hd_iov.iov_len = acb->n * 512;
1377
    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
1378
    acb->hd_aiocb = bdrv_aio_writev(s->hd,
1379
                                    (acb->cluster_offset >> 9) + index_in_cluster,
1380
                                    &acb->hd_qiov, acb->n,
1381
                                    qcow_aio_write_cb, acb);
1382
    if (acb->hd_aiocb == NULL)
1383
        goto done;
1384

    
1385
    return;
1386

    
1387
done:
1388
    if (acb->qiov->niov > 1)
1389
        qemu_vfree(acb->orig_buf);
1390
    acb->common.cb(acb->common.opaque, ret);
1391
    qemu_aio_release(acb);
1392
}
1393

    
1394
static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
1395
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1396
        BlockDriverCompletionFunc *cb, void *opaque)
1397
{
1398
    BDRVQcowState *s = bs->opaque;
1399
    QCowAIOCB *acb;
1400

    
1401
    s->cluster_cache_offset = -1; /* disable compressed cache */
1402

    
1403
    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
1404
    if (!acb)
1405
        return NULL;
1406

    
1407
    qcow_aio_write_cb(acb, 0);
1408
    return &acb->common;
1409
}
1410

    
1411
/*
 * Release everything held by the driver state: the in-memory L1 table, the
 * L2 cache, the cluster cache and scratch buffers, the refcount state and
 * finally the underlying device.
 */
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    /* metadata tables */
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);

    /* cluster buffers */
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);

    refcount_close(bs);
    bdrv_delete(s->hd);
}
1421

    
1422
/*
 * Return log2(size) when size is a power of two, -1 otherwise
 * (including for size == 0).
 */
static int get_bits_from_size(size_t size)
{
    int bits;

    if (size == 0) {
        return -1;
    }

    /* strip trailing zero bits, counting them */
    for (bits = 0; (size & 1) == 0; bits++) {
        size >>= 1;
    }

    /* a power of two has exactly one bit set, which is now bit 0 */
    return (size == 1) ? bits : -1;
}
1442

    
1443
static int qcow_create2(const char *filename, int64_t total_size,
1444
                        const char *backing_file, const char *backing_format,
1445
                        int flags, size_t cluster_size)
1446
{
1447

    
1448
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
1449
    int ref_clusters, backing_format_len = 0;
1450
    QCowHeader header;
1451
    uint64_t tmp, offset;
1452
    QCowCreateState s1, *s = &s1;
1453
    QCowExtension ext_bf = {0, 0};
1454

    
1455

    
1456
    memset(s, 0, sizeof(*s));
1457

    
1458
    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
1459
    if (fd < 0)
1460
        return -1;
1461
    memset(&header, 0, sizeof(header));
1462
    header.magic = cpu_to_be32(QCOW_MAGIC);
1463
    header.version = cpu_to_be32(QCOW_VERSION);
1464
    header.size = cpu_to_be64(total_size * 512);
1465
    header_size = sizeof(header);
1466
    backing_filename_len = 0;
1467
    if (backing_file) {
1468
        if (backing_format) {
1469
            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
1470
            backing_format_len = strlen(backing_format);
1471
            ext_bf.len = (backing_format_len + 7) & ~7;
1472
            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
1473
        }
1474
        header.backing_file_offset = cpu_to_be64(header_size);
1475
        backing_filename_len = strlen(backing_file);
1476
        header.backing_file_size = cpu_to_be32(backing_filename_len);
1477
        header_size += backing_filename_len;
1478
    }
1479

    
1480
    /* Cluster size */
1481
    s->cluster_bits = get_bits_from_size(cluster_size);
1482
    if (s->cluster_bits < MIN_CLUSTER_BITS ||
1483
        s->cluster_bits > MAX_CLUSTER_BITS)
1484
    {
1485
        fprintf(stderr, "Cluster size must be a power of two between "
1486
            "%d and %dk\n",
1487
            1 << MIN_CLUSTER_BITS,
1488
            1 << (MAX_CLUSTER_BITS - 10));
1489
        return -EINVAL;
1490
    }
1491
    s->cluster_size = 1 << s->cluster_bits;
1492

    
1493
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
1494
    header_size = (header_size + 7) & ~7;
1495
    if (flags & BLOCK_FLAG_ENCRYPT) {
1496
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1497
    } else {
1498
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1499
    }
1500
    l2_bits = s->cluster_bits - 3;
1501
    shift = s->cluster_bits + l2_bits;
1502
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
1503
    offset = align_offset(header_size, s->cluster_size);
1504
    s->l1_table_offset = offset;
1505
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
1506
    header.l1_size = cpu_to_be32(l1_size);
1507
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
1508

    
1509
    s->refcount_table = qemu_mallocz(s->cluster_size);
1510

    
1511
    s->refcount_table_offset = offset;
1512
    header.refcount_table_offset = cpu_to_be64(offset);
1513
    header.refcount_table_clusters = cpu_to_be32(1);
1514
    offset += s->cluster_size;
1515
    s->refcount_block_offset = offset;
1516

    
1517
    /* count how many refcount blocks needed */
1518
    tmp = offset >> s->cluster_bits;
1519
    ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1;
1520
    for (i=0; i < ref_clusters; i++) {
1521
        s->refcount_table[i] = cpu_to_be64(offset);
1522
        offset += s->cluster_size;
1523
    }
1524

    
1525
    s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size);
1526

    
1527
    /* update refcounts */
1528
    create_refcount_update(s, 0, header_size);
1529
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
1530
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
1531
    create_refcount_update(s, s->refcount_block_offset, ref_clusters * s->cluster_size);
1532

    
1533
    /* write all the data */
1534
    write(fd, &header, sizeof(header));
1535
    if (backing_file) {
1536
        if (backing_format_len) {
1537
            char zero[16];
1538
            int d = ext_bf.len - backing_format_len;
1539

    
1540
            memset(zero, 0, sizeof(zero));
1541
            cpu_to_be32s(&ext_bf.magic);
1542
            cpu_to_be32s(&ext_bf.len);
1543
            write(fd, &ext_bf, sizeof(ext_bf));
1544
            write(fd, backing_format, backing_format_len);
1545
            if (d>0) {
1546
                write(fd, zero, d);
1547
            }
1548
        }
1549
        write(fd, backing_file, backing_filename_len);
1550
    }
1551
    lseek(fd, s->l1_table_offset, SEEK_SET);
1552
    tmp = 0;
1553
    for(i = 0;i < l1_size; i++) {
1554
        write(fd, &tmp, sizeof(tmp));
1555
    }
1556
    lseek(fd, s->refcount_table_offset, SEEK_SET);
1557
    write(fd, s->refcount_table, s->cluster_size);
1558

    
1559
    lseek(fd, s->refcount_block_offset, SEEK_SET);
1560
    write(fd, s->refcount_block, ref_clusters * s->cluster_size);
1561

    
1562
    qemu_free(s->refcount_table);
1563
    qemu_free(s->refcount_block);
1564
    close(fd);
1565
    return 0;
1566
}
1567

    
1568
/*
 * Image creation entry point: translate the generic option list into
 * arguments for qcow_create2().
 *
 * The size option is given in bytes and converted to 512-byte sectors.
 * A cluster size of 0 keeps the default of 65536 bytes.  Unknown options
 * are ignored.  Returns whatever qcow_create2() returns.
 */
static int qcow_create(const char *filename, QEMUOptionParameter *options)
{
    const char *backing_file = NULL;
    const char *backing_fmt = NULL;
    uint64_t sectors = 0;
    int flags = 0;
    size_t cluster_size = 65536;

    /* Read out options */
    for (; options && options->name; options++) {
        const char *opt = options->name;

        if (strcmp(opt, BLOCK_OPT_SIZE) == 0) {
            sectors = options->value.n / 512;
        } else if (strcmp(opt, BLOCK_OPT_BACKING_FILE) == 0) {
            backing_file = options->value.s;
        } else if (strcmp(opt, BLOCK_OPT_BACKING_FMT) == 0) {
            backing_fmt = options->value.s;
        } else if (strcmp(opt, BLOCK_OPT_ENCRYPT) == 0) {
            if (options->value.n) {
                flags |= BLOCK_FLAG_ENCRYPT;
            }
        } else if (strcmp(opt, BLOCK_OPT_CLUSTER_SIZE) == 0) {
            if (options->value.n) {
                cluster_size = options->value.n;
            }
        }
    }

    return qcow_create2(filename, sectors, backing_file, backing_fmt, flags,
                        cluster_size);
}
1597

    
1598
/*
 * Empty the image.
 *
 * Currently a no-op that always returns 0: the implementation below is
 * compiled out because it is marked as not correct.
 */
static int qcow_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1617

    
1618
/* XXX: put compressed sectors first, then all the cluster aligned
1619
   tables to avoid losing bytes in alignment */
1620
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
1621
                                 const uint8_t *buf, int nb_sectors)
1622
{
1623
    BDRVQcowState *s = bs->opaque;
1624
    z_stream strm;
1625
    int ret, out_len;
1626
    uint8_t *out_buf;
1627
    uint64_t cluster_offset;
1628

    
1629
    if (nb_sectors == 0) {
1630
        /* align end of file to a sector boundary to ease reading with
1631
           sector based I/Os */
1632
        cluster_offset = bdrv_getlength(s->hd);
1633
        cluster_offset = (cluster_offset + 511) & ~511;
1634
        bdrv_truncate(s->hd, cluster_offset);
1635
        return 0;
1636
    }
1637

    
1638
    if (nb_sectors != s->cluster_sectors)
1639
        return -EINVAL;
1640

    
1641
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1642

    
1643
    /* best compression, small window, no zlib header */
1644
    memset(&strm, 0, sizeof(strm));
1645
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1646
                       Z_DEFLATED, -12,
1647
                       9, Z_DEFAULT_STRATEGY);
1648
    if (ret != 0) {
1649
        qemu_free(out_buf);
1650
        return -1;
1651
    }
1652

    
1653
    strm.avail_in = s->cluster_size;
1654
    strm.next_in = (uint8_t *)buf;
1655
    strm.avail_out = s->cluster_size;
1656
    strm.next_out = out_buf;
1657

    
1658
    ret = deflate(&strm, Z_FINISH);
1659
    if (ret != Z_STREAM_END && ret != Z_OK) {
1660
        qemu_free(out_buf);
1661
        deflateEnd(&strm);
1662
        return -1;
1663
    }
1664
    out_len = strm.next_out - out_buf;
1665

    
1666
    deflateEnd(&strm);
1667

    
1668
    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1669
        /* could not compress: write normal cluster */
1670
        bdrv_write(bs, sector_num, buf, s->cluster_sectors);
1671
    } else {
1672
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
1673
                                              out_len);
1674
        if (!cluster_offset)
1675
            return -1;
1676
        cluster_offset &= s->cluster_offset_mask;
1677
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
1678
            qemu_free(out_buf);
1679
            return -1;
1680
        }
1681
    }
1682

    
1683
    qemu_free(out_buf);
1684
    return 0;
1685
}
1686

    
1687
/* Flush the image by flushing the underlying device. */
static void qcow_flush(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;

    bdrv_flush(state->hd);
}
1692

    
1693
/*
 * Fill in driver information: the cluster size and the offset of the VM
 * state area, which starts at L1 index s->l1_vm_state_index.
 * Always returns 0.
 */
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;
    /* log2 of the number of bytes addressed by one L1 entry */
    int l1_entry_bits = s->cluster_bits + s->l2_bits;

    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset = (int64_t)s->l1_vm_state_index << l1_entry_bits;
    return 0;
}
1701

    
1702
/*********************************************************/
1703
/* snapshot support */
1704

    
1705

    
1706
/*
 * Free the in-memory snapshot table (names, id strings and the array
 * itself) and reset it to the empty state.
 */
static void qcow_free_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        QCowSnapshot *snap = &s->snapshots[idx];

        qemu_free(snap->name);
        qemu_free(snap->id_str);
    }

    qemu_free(s->snapshots);
    s->snapshots = NULL;
    s->nb_snapshots = 0;
}
1719

    
1720
/*
 * Load the snapshot table from the image into s->snapshots.
 *
 * Each on-disk entry starts at an 8-byte aligned offset and consists of a
 * QCowSnapshotHeader, extra data (skipped), the id string and the name.
 * On success s->snapshots_size is set to the number of bytes the table
 * occupies.
 *
 * Returns 0 on success; on a read error the partially built table is freed
 * and -1 is returned.
 */
static int qcow_read_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshotHeader hdr;
    QCowSnapshot *snap;
    int idx, id_len, name_len;
    int64_t pos;
    uint32_t extra_len;

    if (s->nb_snapshots == 0) {
        s->snapshots = NULL;
        s->snapshots_size = 0;
        return 0;
    }

    pos = s->snapshots_offset;
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        /* fixed-size header */
        pos = align_offset(pos, 8);
        if (bdrv_pread(s->hd, pos, &hdr, sizeof(hdr)) != sizeof(hdr)) {
            goto fail;
        }
        pos += sizeof(hdr);

        snap = &s->snapshots[idx];
        snap->l1_table_offset = be64_to_cpu(hdr.l1_table_offset);
        snap->l1_size = be32_to_cpu(hdr.l1_size);
        snap->vm_state_size = be32_to_cpu(hdr.vm_state_size);
        snap->date_sec = be32_to_cpu(hdr.date_sec);
        snap->date_nsec = be32_to_cpu(hdr.date_nsec);
        snap->vm_clock_nsec = be64_to_cpu(hdr.vm_clock_nsec);

        extra_len = be32_to_cpu(hdr.extra_data_size);
        id_len = be16_to_cpu(hdr.id_str_size);
        name_len = be16_to_cpu(hdr.name_size);

        /* extra data is not interpreted, only skipped */
        pos += extra_len;

        /* id string; stored without terminator on disk */
        snap->id_str = qemu_malloc(id_len + 1);
        if (bdrv_pread(s->hd, pos, snap->id_str, id_len) != id_len) {
            goto fail;
        }
        pos += id_len;
        snap->id_str[id_len] = '\0';

        /* name; stored without terminator on disk */
        snap->name = qemu_malloc(name_len + 1);
        if (bdrv_pread(s->hd, pos, snap->name, name_len) != name_len) {
            goto fail;
        }
        pos += name_len;
        snap->name[name_len] = '\0';
    }

    s->snapshots_size = pos - s->snapshots_offset;
    return 0;

 fail:
    qcow_free_snapshots(bs);
    return -1;
}
1774

    
1775
/* add at the end of the file a new list of snapshots */
1776
static int qcow_write_snapshots(BlockDriverState *bs)
1777
{
1778
    BDRVQcowState *s = bs->opaque;
1779
    QCowSnapshot *sn;
1780
    QCowSnapshotHeader h;
1781
    int i, name_size, id_str_size, snapshots_size;
1782
    uint64_t data64;
1783
    uint32_t data32;
1784
    int64_t offset, snapshots_offset;
1785

    
1786
    /* compute the size of the snapshots */
1787
    offset = 0;
1788
    for(i = 0; i < s->nb_snapshots; i++) {
1789
        sn = s->snapshots + i;
1790
        offset = align_offset(offset, 8);
1791
        offset += sizeof(h);
1792
        offset += strlen(sn->id_str);
1793
        offset += strlen(sn->name);
1794
    }
1795
    snapshots_size = offset;
1796

    
1797
    snapshots_offset = alloc_clusters(bs, snapshots_size);
1798
    offset = snapshots_offset;
1799

    
1800
    for(i = 0; i < s->nb_snapshots; i++) {
1801
        sn = s->snapshots + i;
1802
        memset(&h, 0, sizeof(h));
1803
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
1804
        h.l1_size = cpu_to_be32(sn->l1_size);
1805
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
1806
        h.date_sec = cpu_to_be32(sn->date_sec);
1807
        h.date_nsec = cpu_to_be32(sn->date_nsec);
1808
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
1809

    
1810
        id_str_size = strlen(sn->id_str);
1811
        name_size = strlen(sn->name);
1812
        h.id_str_size = cpu_to_be16(id_str_size);
1813
        h.name_size = cpu_to_be16(name_size);
1814
        offset = align_offset(offset, 8);
1815
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1816
            goto fail;
1817
        offset += sizeof(h);
1818
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1819
            goto fail;
1820
        offset += id_str_size;
1821
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
1822
            goto fail;
1823
        offset += name_size;
1824
    }
1825

    
1826
    /* update the various header fields */
1827
    data64 = cpu_to_be64(snapshots_offset);
1828
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
1829
                    &data64, sizeof(data64)) != sizeof(data64))
1830
        goto fail;
1831
    data32 = cpu_to_be32(s->nb_snapshots);
1832
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
1833
                    &data32, sizeof(data32)) != sizeof(data32))
1834
        goto fail;
1835

    
1836
    /* free the old snapshot table */
1837
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
1838
    s->snapshots_offset = snapshots_offset;
1839
    s->snapshots_size = snapshots_size;
1840
    return 0;
1841
 fail:
1842
    return -1;
1843
}
1844

    
1845
/*
 * Produce an unused snapshot ID.
 *
 * Every existing snapshot ID is parsed as a base-10 number; the result
 * written to id_str (bounded by id_str_size via snprintf) is the largest
 * value found plus one, or "1" when there are no snapshots.
 */
static void find_new_snapshot_id(BlockDriverState *bs,
                                 char *id_str, int id_str_size)
{
    BDRVQcowState *s = bs->opaque;
    int highest = 0;
    int idx;

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        int current = strtoul(s->snapshots[idx].id_str, NULL, 10);

        if (current > highest) {
            highest = current;
        }
    }
    snprintf(id_str, id_str_size, "%d", highest + 1);
}
1860

    
1861
/*
 * Look up a snapshot by exact ID string.
 *
 * Returns the index into s->snapshots of the first entry whose id_str
 * equals id_str, or -1 when no entry matches.
 */
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
{
    BDRVQcowState *s = bs->opaque;
    int idx = 0;

    while (idx < s->nb_snapshots) {
        if (strcmp(s->snapshots[idx].id_str, id_str) == 0) {
            return idx;
        }
        idx++;
    }
    return -1;
}
1872

    
1873
/*
 * Look up a snapshot by ID or, failing that, by name.
 *
 * The string is first matched against snapshot IDs; only when no ID
 * matches is it compared against snapshot names. Returns the index of
 * the first match, or -1 when nothing matches.
 */
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
{
    BDRVQcowState *s = bs->opaque;
    int idx;
    int by_id;

    /* an ID match takes precedence over a name match */
    by_id = find_snapshot_by_id(bs, name);
    if (by_id >= 0) {
        return by_id;
    }

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        if (strcmp(s->snapshots[idx].name, name) == 0) {
            return idx;
        }
    }
    return -1;
}
1887

    
1888
/* if no id is provided, a new one is constructed */
1889
static int qcow_snapshot_create(BlockDriverState *bs,
1890
                                QEMUSnapshotInfo *sn_info)
1891
{
1892
    BDRVQcowState *s = bs->opaque;
1893
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
1894
    int i, ret;
1895
    uint64_t *l1_table = NULL;
1896

    
1897
    memset(sn, 0, sizeof(*sn));
1898

    
1899
    if (sn_info->id_str[0] == '\0') {
1900
        /* compute a new id */
1901
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
1902
    }
1903

    
1904
    /* check that the ID is unique */
1905
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
1906
        return -ENOENT;
1907

    
1908
    sn->id_str = qemu_strdup(sn_info->id_str);
1909
    if (!sn->id_str)
1910
        goto fail;
1911
    sn->name = qemu_strdup(sn_info->name);
1912
    if (!sn->name)
1913
        goto fail;
1914
    sn->vm_state_size = sn_info->vm_state_size;
1915
    sn->date_sec = sn_info->date_sec;
1916
    sn->date_nsec = sn_info->date_nsec;
1917
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
1918

    
1919
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
1920
    if (ret < 0)
1921
        goto fail;
1922

    
1923
    /* create the L1 table of the snapshot */
1924
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
1925
    sn->l1_size = s->l1_size;
1926

    
1927
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
1928
    for(i = 0; i < s->l1_size; i++) {
1929
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
1930
    }
1931
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
1932
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
1933
        (s->l1_size * sizeof(uint64_t)))
1934
        goto fail;
1935
    qemu_free(l1_table);
1936
    l1_table = NULL;
1937

    
1938
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
1939
    if (s->snapshots) {
1940
        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
1941
        qemu_free(s->snapshots);
1942
    }
1943
    s->snapshots = snapshots1;
1944
    s->snapshots[s->nb_snapshots++] = *sn;
1945

    
1946
    if (qcow_write_snapshots(bs) < 0)
1947
        goto fail;
1948
#ifdef DEBUG_ALLOC
1949
    check_refcounts(bs);
1950
#endif
1951
    return 0;
1952
 fail:
1953
    qemu_free(sn->name);
1954
    qemu_free(l1_table);
1955
    return -1;
1956
}
1957

    
1958
/* copy the snapshot 'snapshot_name' into the current disk image */
1959
static int qcow_snapshot_goto(BlockDriverState *bs,
1960
                              const char *snapshot_id)
1961
{
1962
    BDRVQcowState *s = bs->opaque;
1963
    QCowSnapshot *sn;
1964
    int i, snapshot_index, l1_size2;
1965

    
1966
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
1967
    if (snapshot_index < 0)
1968
        return -ENOENT;
1969
    sn = &s->snapshots[snapshot_index];
1970

    
1971
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
1972
        goto fail;
1973

    
1974
    if (grow_l1_table(bs, sn->l1_size) < 0)
1975
        goto fail;
1976

    
1977
    s->l1_size = sn->l1_size;
1978
    l1_size2 = s->l1_size * sizeof(uint64_t);
1979
    /* copy the snapshot l1 table to the current l1 table */
1980
    if (bdrv_pread(s->hd, sn->l1_table_offset,
1981
                   s->l1_table, l1_size2) != l1_size2)
1982
        goto fail;
1983
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
1984
                    s->l1_table, l1_size2) != l1_size2)
1985
        goto fail;
1986
    for(i = 0;i < s->l1_size; i++) {
1987
        be64_to_cpus(&s->l1_table[i]);
1988
    }
1989

    
1990
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
1991
        goto fail;
1992

    
1993
#ifdef DEBUG_ALLOC
1994
    check_refcounts(bs);
1995
#endif
1996
    return 0;
1997
 fail:
1998
    return -EIO;
1999
}
2000

    
2001
/*
 * Delete the snapshot identified by snapshot_id (matched by ID first,
 * then by name).
 *
 * Returns 0 on success, -ENOENT when no such snapshot exists, or the
 * negative value reported by update_snapshot_refcount() or
 * qcow_write_snapshots() when one of them fails.
 */
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *victim;
    int index, rc;

    index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (index < 0) {
        return -ENOENT;
    }
    victim = &s->snapshots[index];

    rc = update_snapshot_refcount(bs, victim->l1_table_offset,
                                  victim->l1_size, -1);
    if (rc < 0) {
        return rc;
    }

    /* must update the copied flag on the current cluster offsets */
    rc = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
    if (rc < 0) {
        return rc;
    }
    free_clusters(bs, victim->l1_table_offset,
                  victim->l1_size * sizeof(uint64_t));

    /* release the entry's strings, then close the gap in the table */
    qemu_free(victim->id_str);
    qemu_free(victim->name);
    memmove(victim, victim + 1,
            (s->nb_snapshots - index - 1) * sizeof(*victim));
    s->nb_snapshots--;

    rc = qcow_write_snapshots(bs);
    if (rc < 0) {
        /* XXX: restore snapshot if error ? */
        return rc;
    }
#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
}
2035

    
2036
static int qcow_snapshot_list(BlockDriverState *bs,
2037
                              QEMUSnapshotInfo **psn_tab)
2038
{
2039
    BDRVQcowState *s = bs->opaque;
2040
    QEMUSnapshotInfo *sn_tab, *sn_info;
2041
    QCowSnapshot *sn;
2042
    int i;
2043

    
2044
    if (!s->nb_snapshots) {
2045
        *psn_tab = NULL;
2046
        return s->nb_snapshots;
2047
    }
2048

    
2049
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2050
    for(i = 0; i < s->nb_snapshots; i++) {
2051
        sn_info = sn_tab + i;
2052
        sn = s->snapshots + i;
2053
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2054
                sn->id_str);
2055
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2056
                sn->name);
2057
        sn_info->vm_state_size = sn->vm_state_size;
2058
        sn_info->date_sec = sn->date_sec;
2059
        sn_info->date_nsec = sn->date_nsec;
2060
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2061
    }
2062
    *psn_tab = sn_tab;
2063
    return s->nb_snapshots;
2064
}
2065

    
2066
/* Image check entry point: delegates to check_refcounts() and returns its result. */
static int qcow_check(BlockDriverState *bs)
{
    int result = check_refcounts(bs);

    return result;
}
2070

    
2071
#if 0
/*
 * Debug helper (compiled out): print one line per run of consecutive
 * clusters that share the same refcount, covering every cluster up to
 * the length of the underlying file.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(s->hd);
    nb_clusters = size_to_clusters(s, size);
    for(k = 0; k < nb_clusters;) {
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        /*
         * int64_t is not necessarily long long, so cast to match %lld.
         * NOTE(review): the first field is k (one past the run), not k1
         * (the start of the run) -- confirm which was intended.
         */
        printf("%lld: refcount=%d nb=%lld\n",
               (long long)k, refcount, (long long)(k - k1));
    }
}
#endif
2090

    
2091
/*
 * Write size bytes from buf at offset pos of bs.
 *
 * bs->growable is forced to 1 for the duration of the write and restored
 * afterwards.
 *
 * Returns the value reported by bdrv_pwrite(), so a failed write is
 * visible to the caller instead of being reported as size bytes written.
 */
static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf,
                           int64_t pos, int size)
{
    int growable = bs->growable;
    int ret;

    bs->growable = 1;
    ret = bdrv_pwrite(bs, pos, buf, size);
    bs->growable = growable;

    return ret;
}
2102

    
2103
/*
 * Read size bytes into buf from offset pos of bs.
 *
 * bs->growable is forced to 1 for the duration of the read and restored
 * afterwards. Returns whatever bdrv_pread() returned.
 */
static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf,
                           int64_t pos, int size)
{
    const int saved_growable = bs->growable;
    int nread;

    bs->growable = 1;
    nread = bdrv_pread(bs, pos, buf, size);
    bs->growable = saved_growable;

    return nread;
}
2115

    
2116
static QEMUOptionParameter qcow_create_options[] = {
2117
    {
2118
        .name = BLOCK_OPT_SIZE,
2119
        .type = OPT_SIZE,
2120
        .help = "Virtual disk size"
2121
    },
2122
    {
2123
        .name = BLOCK_OPT_BACKING_FILE,
2124
        .type = OPT_STRING,
2125
        .help = "File name of a base image"
2126
    },
2127
    {
2128
        .name = BLOCK_OPT_BACKING_FMT,
2129
        .type = OPT_STRING,
2130
        .help = "Image format of the base image"
2131
    },
2132
    {
2133
        .name = BLOCK_OPT_ENCRYPT,
2134
        .type = OPT_FLAG,
2135
        .help = "Encrypt the image"
2136
    },
2137
    {
2138
        .name = BLOCK_OPT_CLUSTER_SIZE,
2139
        .type = OPT_SIZE,
2140
        .help = "qcow2 cluster size"
2141
    },
2142
    { NULL }
2143
};
2144

    
2145
static BlockDriver bdrv_qcow2 = {
2146
    .format_name        = "qcow2",
2147
    .instance_size        = sizeof(BDRVQcowState),
2148
    .bdrv_probe                = qcow_probe,
2149
    .bdrv_open                = qcow_open,
2150
    .bdrv_close                = qcow_close,
2151
    .bdrv_create        = qcow_create,
2152
    .bdrv_flush                = qcow_flush,
2153
    .bdrv_is_allocated        = qcow_is_allocated,
2154
    .bdrv_set_key        = qcow_set_key,
2155
    .bdrv_make_empty        = qcow_make_empty,
2156

    
2157
    .bdrv_aio_readv        = qcow_aio_readv,
2158
    .bdrv_aio_writev        = qcow_aio_writev,
2159
    .bdrv_write_compressed = qcow_write_compressed,
2160

    
2161
    .bdrv_snapshot_create = qcow_snapshot_create,
2162
    .bdrv_snapshot_goto        = qcow_snapshot_goto,
2163
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2164
    .bdrv_snapshot_list        = qcow_snapshot_list,
2165
    .bdrv_get_info        = qcow_get_info,
2166

    
2167
    .bdrv_put_buffer    = qcow_put_buffer,
2168
    .bdrv_get_buffer    = qcow_get_buffer,
2169

    
2170
    .create_options = qcow_create_options,
2171
    .bdrv_check = qcow_check,
2172
};
2173

    
2174
static void bdrv_qcow2_init(void)
2175
{
2176
    bdrv_register(&bdrv_qcow2);
2177
}
2178

    
2179
block_init(bdrv_qcow2_init);