Statistics
| Branch: | Revision:

root / block-qcow2.c @ bc352085

History | View | Annotate | Download (79.9 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block_int.h"
26
#include <zlib.h>
27
#include "aes.h"
28
#include <assert.h>
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots.
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables have always a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
47
//#define DEBUG_ALLOC2
48

    
49
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
50
#define QCOW_VERSION 2
51

    
52
#define QCOW_CRYPT_NONE 0
53
#define QCOW_CRYPT_AES  1
54

    
55
#define QCOW_MAX_CRYPT_CLUSTERS 32
56

    
57
/* indicate that the refcount of the referenced cluster is exactly one. */
58
#define QCOW_OFLAG_COPIED     (1LL << 63)
59
/* indicate that the cluster is compressed (they never have the copied flag) */
60
#define QCOW_OFLAG_COMPRESSED (1LL << 62)
61

    
62
#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
63

    
64
#ifndef offsetof
65
#define offsetof(type, field) ((size_t) &((type *)0)->field)
66
#endif
67

    
68
typedef struct QCowHeader {
69
    uint32_t magic;
70
    uint32_t version;
71
    uint64_t backing_file_offset;
72
    uint32_t backing_file_size;
73
    uint32_t cluster_bits;
74
    uint64_t size; /* in bytes */
75
    uint32_t crypt_method;
76
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
77
    uint64_t l1_table_offset;
78
    uint64_t refcount_table_offset;
79
    uint32_t refcount_table_clusters;
80
    uint32_t nb_snapshots;
81
    uint64_t snapshots_offset;
82
} QCowHeader;
83

    
84
typedef struct __attribute__((packed)) QCowSnapshotHeader {
85
    /* header is 8 byte aligned */
86
    uint64_t l1_table_offset;
87

    
88
    uint32_t l1_size;
89
    uint16_t id_str_size;
90
    uint16_t name_size;
91

    
92
    uint32_t date_sec;
93
    uint32_t date_nsec;
94

    
95
    uint64_t vm_clock_nsec;
96

    
97
    uint32_t vm_state_size;
98
    uint32_t extra_data_size; /* for extension */
99
    /* extra data follows */
100
    /* id_str follows */
101
    /* name follows  */
102
} QCowSnapshotHeader;
103

    
104
#define L2_CACHE_SIZE 16
105

    
106
typedef struct QCowSnapshot {
107
    uint64_t l1_table_offset;
108
    uint32_t l1_size;
109
    char *id_str;
110
    char *name;
111
    uint32_t vm_state_size;
112
    uint32_t date_sec;
113
    uint32_t date_nsec;
114
    uint64_t vm_clock_nsec;
115
} QCowSnapshot;
116

    
117
typedef struct BDRVQcowState {
118
    BlockDriverState *hd;
119
    int cluster_bits;
120
    int cluster_size;
121
    int cluster_sectors;
122
    int l2_bits;
123
    int l2_size;
124
    int l1_size;
125
    int l1_vm_state_index;
126
    int csize_shift;
127
    int csize_mask;
128
    uint64_t cluster_offset_mask;
129
    uint64_t l1_table_offset;
130
    uint64_t *l1_table;
131
    uint64_t *l2_cache;
132
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
133
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
134
    uint8_t *cluster_cache;
135
    uint8_t *cluster_data;
136
    uint64_t cluster_cache_offset;
137

    
138
    uint64_t *refcount_table;
139
    uint64_t refcount_table_offset;
140
    uint32_t refcount_table_size;
141
    uint64_t refcount_block_cache_offset;
142
    uint16_t *refcount_block_cache;
143
    int64_t free_cluster_index;
144
    int64_t free_byte_offset;
145

    
146
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
147
    uint32_t crypt_method_header;
148
    AES_KEY aes_encrypt_key;
149
    AES_KEY aes_decrypt_key;
150
    uint64_t snapshots_offset;
151
    int snapshots_size;
152
    int nb_snapshots;
153
    QCowSnapshot *snapshots;
154
} BDRVQcowState;
155

    
156
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
157
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
158
                     uint8_t *buf, int nb_sectors);
159
static int qcow_read_snapshots(BlockDriverState *bs);
160
static void qcow_free_snapshots(BlockDriverState *bs);
161
static int refcount_init(BlockDriverState *bs);
162
static void refcount_close(BlockDriverState *bs);
163
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
164
static int update_cluster_refcount(BlockDriverState *bs,
165
                                   int64_t cluster_index,
166
                                   int addend);
167
static void update_refcount(BlockDriverState *bs,
168
                            int64_t offset, int64_t length,
169
                            int addend);
170
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
171
static int64_t alloc_bytes(BlockDriverState *bs, int size);
172
static void free_clusters(BlockDriverState *bs,
173
                          int64_t offset, int64_t size);
174
#ifdef DEBUG_ALLOC
175
static void check_refcounts(BlockDriverState *bs);
176
#endif
177

    
178
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
179
{
180
    const QCowHeader *cow_header = (const void *)buf;
181

    
182
    if (buf_size >= sizeof(QCowHeader) &&
183
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
184
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
185
        return 100;
186
    else
187
        return 0;
188
}
189

    
190
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
191
{
192
    BDRVQcowState *s = bs->opaque;
193
    int len, i, shift, ret;
194
    QCowHeader header;
195

    
196
    ret = bdrv_file_open(&s->hd, filename, flags);
197
    if (ret < 0)
198
        return ret;
199
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
200
        goto fail;
201
    be32_to_cpus(&header.magic);
202
    be32_to_cpus(&header.version);
203
    be64_to_cpus(&header.backing_file_offset);
204
    be32_to_cpus(&header.backing_file_size);
205
    be64_to_cpus(&header.size);
206
    be32_to_cpus(&header.cluster_bits);
207
    be32_to_cpus(&header.crypt_method);
208
    be64_to_cpus(&header.l1_table_offset);
209
    be32_to_cpus(&header.l1_size);
210
    be64_to_cpus(&header.refcount_table_offset);
211
    be32_to_cpus(&header.refcount_table_clusters);
212
    be64_to_cpus(&header.snapshots_offset);
213
    be32_to_cpus(&header.nb_snapshots);
214

    
215
    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
216
        goto fail;
217
    if (header.size <= 1 ||
218
        header.cluster_bits < 9 ||
219
        header.cluster_bits > 16)
220
        goto fail;
221
    if (header.crypt_method > QCOW_CRYPT_AES)
222
        goto fail;
223
    s->crypt_method_header = header.crypt_method;
224
    if (s->crypt_method_header)
225
        bs->encrypted = 1;
226
    s->cluster_bits = header.cluster_bits;
227
    s->cluster_size = 1 << s->cluster_bits;
228
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
229
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
230
    s->l2_size = 1 << s->l2_bits;
231
    bs->total_sectors = header.size / 512;
232
    s->csize_shift = (62 - (s->cluster_bits - 8));
233
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
234
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
235
    s->refcount_table_offset = header.refcount_table_offset;
236
    s->refcount_table_size =
237
        header.refcount_table_clusters << (s->cluster_bits - 3);
238

    
239
    s->snapshots_offset = header.snapshots_offset;
240
    s->nb_snapshots = header.nb_snapshots;
241

    
242
    /* read the level 1 table */
243
    s->l1_size = header.l1_size;
244
    shift = s->cluster_bits + s->l2_bits;
245
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
246
    /* the L1 table must contain at least enough entries to put
247
       header.size bytes */
248
    if (s->l1_size < s->l1_vm_state_index)
249
        goto fail;
250
    s->l1_table_offset = header.l1_table_offset;
251
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
252
    if (!s->l1_table)
253
        goto fail;
254
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
255
        s->l1_size * sizeof(uint64_t))
256
        goto fail;
257
    for(i = 0;i < s->l1_size; i++) {
258
        be64_to_cpus(&s->l1_table[i]);
259
    }
260
    /* alloc L2 cache */
261
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
262
    if (!s->l2_cache)
263
        goto fail;
264
    s->cluster_cache = qemu_malloc(s->cluster_size);
265
    if (!s->cluster_cache)
266
        goto fail;
267
    /* one more sector for decompressed data alignment */
268
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
269
                                  + 512);
270
    if (!s->cluster_data)
271
        goto fail;
272
    s->cluster_cache_offset = -1;
273

    
274
    if (refcount_init(bs) < 0)
275
        goto fail;
276

    
277
    /* read the backing file name */
278
    if (header.backing_file_offset != 0) {
279
        len = header.backing_file_size;
280
        if (len > 1023)
281
            len = 1023;
282
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
283
            goto fail;
284
        bs->backing_file[len] = '\0';
285
    }
286
    if (qcow_read_snapshots(bs) < 0)
287
        goto fail;
288

    
289
#ifdef DEBUG_ALLOC
290
    check_refcounts(bs);
291
#endif
292
    return 0;
293

    
294
 fail:
295
    qcow_free_snapshots(bs);
296
    refcount_close(bs);
297
    qemu_free(s->l1_table);
298
    qemu_free(s->l2_cache);
299
    qemu_free(s->cluster_cache);
300
    qemu_free(s->cluster_data);
301
    bdrv_delete(s->hd);
302
    return -1;
303
}
304

    
305
static int qcow_set_key(BlockDriverState *bs, const char *key)
306
{
307
    BDRVQcowState *s = bs->opaque;
308
    uint8_t keybuf[16];
309
    int len, i;
310

    
311
    memset(keybuf, 0, 16);
312
    len = strlen(key);
313
    if (len > 16)
314
        len = 16;
315
    /* XXX: we could compress the chars to 7 bits to increase
316
       entropy */
317
    for(i = 0;i < len;i++) {
318
        keybuf[i] = key[i];
319
    }
320
    s->crypt_method = s->crypt_method_header;
321

    
322
    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
323
        return -1;
324
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
325
        return -1;
326
#if 0
327
    /* test */
328
    {
329
        uint8_t in[16];
330
        uint8_t out[16];
331
        uint8_t tmp[16];
332
        for(i=0;i<16;i++)
333
            in[i] = i;
334
        AES_encrypt(in, tmp, &s->aes_encrypt_key);
335
        AES_decrypt(tmp, out, &s->aes_decrypt_key);
336
        for(i = 0; i < 16; i++)
337
            printf(" %02x", tmp[i]);
338
        printf("\n");
339
        for(i = 0; i < 16; i++)
340
            printf(" %02x", out[i]);
341
        printf("\n");
342
    }
343
#endif
344
    return 0;
345
}
346

    
347
/* The crypt function is compatible with the linux cryptoloop
348
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
349
   supported */
350
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
351
                            uint8_t *out_buf, const uint8_t *in_buf,
352
                            int nb_sectors, int enc,
353
                            const AES_KEY *key)
354
{
355
    union {
356
        uint64_t ll[2];
357
        uint8_t b[16];
358
    } ivec;
359
    int i;
360

    
361
    for(i = 0; i < nb_sectors; i++) {
362
        ivec.ll[0] = cpu_to_le64(sector_num);
363
        ivec.ll[1] = 0;
364
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
365
                        ivec.b, enc);
366
        sector_num++;
367
        in_buf += 512;
368
        out_buf += 512;
369
    }
370
}
371

    
372
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
373
                        uint64_t cluster_offset, int n_start, int n_end)
374
{
375
    BDRVQcowState *s = bs->opaque;
376
    int n, ret;
377

    
378
    n = n_end - n_start;
379
    if (n <= 0)
380
        return 0;
381
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
382
    if (ret < 0)
383
        return ret;
384
    if (s->crypt_method) {
385
        encrypt_sectors(s, start_sect + n_start,
386
                        s->cluster_data,
387
                        s->cluster_data, n, 1,
388
                        &s->aes_encrypt_key);
389
    }
390
    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
391
                     s->cluster_data, n);
392
    if (ret < 0)
393
        return ret;
394
    return 0;
395
}
396

    
397
static void l2_cache_reset(BlockDriverState *bs)
398
{
399
    BDRVQcowState *s = bs->opaque;
400

    
401
    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
402
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
403
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
404
}
405

    
406
static inline int l2_cache_new_entry(BlockDriverState *bs)
407
{
408
    BDRVQcowState *s = bs->opaque;
409
    uint32_t min_count;
410
    int min_index, i;
411

    
412
    /* find a new entry in the least used one */
413
    min_index = 0;
414
    min_count = 0xffffffff;
415
    for(i = 0; i < L2_CACHE_SIZE; i++) {
416
        if (s->l2_cache_counts[i] < min_count) {
417
            min_count = s->l2_cache_counts[i];
418
            min_index = i;
419
        }
420
    }
421
    return min_index;
422
}
423

    
424
static int64_t align_offset(int64_t offset, int n)
425
{
426
    offset = (offset + n - 1) & ~(n - 1);
427
    return offset;
428
}
429

    
430
static int grow_l1_table(BlockDriverState *bs, int min_size)
431
{
432
    BDRVQcowState *s = bs->opaque;
433
    int new_l1_size, new_l1_size2, ret, i;
434
    uint64_t *new_l1_table;
435
    uint64_t new_l1_table_offset;
436
    uint64_t data64;
437
    uint32_t data32;
438

    
439
    new_l1_size = s->l1_size;
440
    if (min_size <= new_l1_size)
441
        return 0;
442
    while (min_size > new_l1_size) {
443
        new_l1_size = (new_l1_size * 3 + 1) / 2;
444
    }
445
#ifdef DEBUG_ALLOC2
446
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
447
#endif
448

    
449
    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
450
    new_l1_table = qemu_mallocz(new_l1_size2);
451
    if (!new_l1_table)
452
        return -ENOMEM;
453
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
454

    
455
    /* write new table (align to cluster) */
456
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
457

    
458
    for(i = 0; i < s->l1_size; i++)
459
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
460
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
461
    if (ret != new_l1_size2)
462
        goto fail;
463
    for(i = 0; i < s->l1_size; i++)
464
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
465

    
466
    /* set new table */
467
    data64 = cpu_to_be64(new_l1_table_offset);
468
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_table_offset),
469
                    &data64, sizeof(data64)) != sizeof(data64))
470
        goto fail;
471
    data32 = cpu_to_be32(new_l1_size);
472
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size),
473
                    &data32, sizeof(data32)) != sizeof(data32))
474
        goto fail;
475
    qemu_free(s->l1_table);
476
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
477
    s->l1_table_offset = new_l1_table_offset;
478
    s->l1_table = new_l1_table;
479
    s->l1_size = new_l1_size;
480
    return 0;
481
 fail:
482
    qemu_free(s->l1_table);
483
    return -EIO;
484
}
485

    
486
/*
487
 * seek_l2_table
488
 *
489
 * seek l2_offset in the l2_cache table
490
 * if not found, return NULL,
491
 * if found,
492
 *   increments the l2 cache hit count of the entry,
493
 *   if counter overflow, divide by two all counters
494
 *   return the pointer to the l2 cache entry
495
 *
496
 */
497

    
498
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
499
{
500
    int i, j;
501

    
502
    for(i = 0; i < L2_CACHE_SIZE; i++) {
503
        if (l2_offset == s->l2_cache_offsets[i]) {
504
            /* increment the hit count */
505
            if (++s->l2_cache_counts[i] == 0xffffffff) {
506
                for(j = 0; j < L2_CACHE_SIZE; j++) {
507
                    s->l2_cache_counts[j] >>= 1;
508
                }
509
            }
510
            return s->l2_cache + (i << s->l2_bits);
511
        }
512
    }
513
    return NULL;
514
}
515

    
516
/*
517
 * l2_load
518
 *
519
 * Loads a L2 table into memory. If the table is in the cache, the cache
520
 * is used; otherwise the L2 table is loaded from the image file.
521
 *
522
 * Returns a pointer to the L2 table on success, or NULL if the read from
523
 * the image file failed.
524
 */
525

    
526
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
527
{
528
    BDRVQcowState *s = bs->opaque;
529
    int min_index;
530
    uint64_t *l2_table;
531

    
532
    /* seek if the table for the given offset is in the cache */
533

    
534
    l2_table = seek_l2_table(s, l2_offset);
535
    if (l2_table != NULL)
536
        return l2_table;
537

    
538
    /* not found: load a new entry in the least used one */
539

    
540
    min_index = l2_cache_new_entry(bs);
541
    l2_table = s->l2_cache + (min_index << s->l2_bits);
542
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
543
        s->l2_size * sizeof(uint64_t))
544
        return NULL;
545
    s->l2_cache_offsets[min_index] = l2_offset;
546
    s->l2_cache_counts[min_index] = 1;
547

    
548
    return l2_table;
549
}
550

    
551
/*
552
 * l2_allocate
553
 *
554
 * Allocate a new l2 entry in the file. If l1_index points to an already
555
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
556
 * table) copy the contents of the old L2 table into the newly allocated one.
557
 * Otherwise the new table is initialized with zeros.
558
 *
559
 */
560

    
561
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
562
{
563
    BDRVQcowState *s = bs->opaque;
564
    int min_index;
565
    uint64_t old_l2_offset, tmp;
566
    uint64_t *l2_table, l2_offset;
567

    
568
    old_l2_offset = s->l1_table[l1_index];
569

    
570
    /* allocate a new l2 entry */
571

    
572
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
573

    
574
    /* update the L1 entry */
575

    
576
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
577

    
578
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
579
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
580
                    &tmp, sizeof(tmp)) != sizeof(tmp))
581
        return NULL;
582

    
583
    /* allocate a new entry in the l2 cache */
584

    
585
    min_index = l2_cache_new_entry(bs);
586
    l2_table = s->l2_cache + (min_index << s->l2_bits);
587

    
588
    if (old_l2_offset == 0) {
589
        /* if there was no old l2 table, clear the new table */
590
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
591
    } else {
592
        /* if there was an old l2 table, read it from the disk */
593
        if (bdrv_pread(s->hd, old_l2_offset,
594
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
595
            s->l2_size * sizeof(uint64_t))
596
            return NULL;
597
    }
598
    /* write the l2 table to the file */
599
    if (bdrv_pwrite(s->hd, l2_offset,
600
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
601
        s->l2_size * sizeof(uint64_t))
602
        return NULL;
603

    
604
    /* update the l2 cache entry */
605

    
606
    s->l2_cache_offsets[min_index] = l2_offset;
607
    s->l2_cache_counts[min_index] = 1;
608

    
609
    return l2_table;
610
}
611

    
612
/*
613
 * get_cluster_offset
614
 *
615
 * For a given offset of the disk image, return cluster offset in
616
 * qcow2 file.
617
 *
618
 * on entry, *num is the number of contiguous clusters we'd like to
619
 * access following offset.
620
 *
621
 * on exit, *num is the number of contiguous clusters we can read.
622
 *
623
 * Return 1, if the offset is found
624
 * Return 0, otherwise.
625
 *
626
 */
627

    
628
static uint64_t get_cluster_offset(BlockDriverState *bs,
629
                                   uint64_t offset, int *num)
630
{
631
    BDRVQcowState *s = bs->opaque;
632
    int l1_index, l2_index;
633
    uint64_t l2_offset, *l2_table, cluster_offset, next;
634
    int l1_bits;
635
    int index_in_cluster, nb_available, nb_needed;
636

    
637
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
638
    nb_needed = *num + index_in_cluster;
639

    
640
    l1_bits = s->l2_bits + s->cluster_bits;
641

    
642
    /* compute how many bytes there are between the offset and
643
     * and the end of the l1 entry
644
     */
645

    
646
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
647

    
648
    /* compute the number of available sectors */
649

    
650
    nb_available = (nb_available >> 9) + index_in_cluster;
651

    
652
    cluster_offset = 0;
653

    
654
    /* seek the the l2 offset in the l1 table */
655

    
656
    l1_index = offset >> l1_bits;
657
    if (l1_index >= s->l1_size)
658
        goto out;
659

    
660
    l2_offset = s->l1_table[l1_index];
661

    
662
    /* seek the l2 table of the given l2 offset */
663

    
664
    if (!l2_offset)
665
        goto out;
666

    
667
    /* load the l2 table in memory */
668

    
669
    l2_offset &= ~QCOW_OFLAG_COPIED;
670
    l2_table = l2_load(bs, l2_offset);
671
    if (l2_table == NULL)
672
        return 0;
673

    
674
    /* find the cluster offset for the given disk offset */
675

    
676
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
677
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
678
    nb_available = s->cluster_sectors;
679
    l2_index++;
680

    
681
    if (!cluster_offset) {
682

    
683
       /* how many empty clusters ? */
684

    
685
       while (nb_available < nb_needed && !l2_table[l2_index]) {
686
           l2_index++;
687
           nb_available += s->cluster_sectors;
688
       }
689
    } else {
690

    
691
       /* how many allocated clusters ? */
692

    
693
       cluster_offset &= ~QCOW_OFLAG_COPIED;
694
       while (nb_available < nb_needed) {
695
           next = be64_to_cpu(l2_table[l2_index]) & ~QCOW_OFLAG_COPIED;
696
           if (next != cluster_offset + (nb_available << 9))
697
               break;
698
           l2_index++;
699
           nb_available += s->cluster_sectors;
700
       }
701
   }
702

    
703
out:
704
    if (nb_available > nb_needed)
705
        nb_available = nb_needed;
706

    
707
    *num = nb_available - index_in_cluster;
708

    
709
    return cluster_offset;
710
}
711

    
712
/*
713
 * free_any_clusters
714
 *
715
 * free clusters according to its type: compressed or not
716
 *
717
 */
718

    
719
static void free_any_clusters(BlockDriverState *bs,
720
                              uint64_t cluster_offset, int nb_clusters)
721
{
722
    BDRVQcowState *s = bs->opaque;
723

    
724
    /* free the cluster */
725

    
726
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
727
        int nb_csectors;
728
        nb_csectors = ((cluster_offset >> s->csize_shift) &
729
                       s->csize_mask) + 1;
730
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
731
                      nb_csectors * 512);
732
        return;
733
    }
734

    
735
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
736

    
737
    return;
738
}
739

    
740
/*
741
 * get_cluster_table
742
 *
743
 * for a given disk offset, load (and allocate if needed)
744
 * the l2 table.
745
 *
746
 * the l2 table offset in the qcow2 file and the cluster index
747
 * in the l2 table are given to the caller.
748
 *
749
 */
750

    
751
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
752
                             uint64_t **new_l2_table,
753
                             uint64_t *new_l2_offset,
754
                             int *new_l2_index)
755
{
756
    BDRVQcowState *s = bs->opaque;
757
    int l1_index, l2_index, ret;
758
    uint64_t l2_offset, *l2_table;
759

    
760
    /* seek the the l2 offset in the l1 table */
761

    
762
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
763
    if (l1_index >= s->l1_size) {
764
        ret = grow_l1_table(bs, l1_index + 1);
765
        if (ret < 0)
766
            return 0;
767
    }
768
    l2_offset = s->l1_table[l1_index];
769

    
770
    /* seek the l2 table of the given l2 offset */
771

    
772
    if (l2_offset & QCOW_OFLAG_COPIED) {
773
        /* load the l2 table in memory */
774
        l2_offset &= ~QCOW_OFLAG_COPIED;
775
        l2_table = l2_load(bs, l2_offset);
776
        if (l2_table == NULL)
777
            return 0;
778
    } else {
779
        if (l2_offset)
780
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
781
        l2_table = l2_allocate(bs, l1_index);
782
        if (l2_table == NULL)
783
            return 0;
784
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
785
    }
786

    
787
    /* find the cluster offset for the given disk offset */
788

    
789
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
790

    
791
    *new_l2_table = l2_table;
792
    *new_l2_offset = l2_offset;
793
    *new_l2_index = l2_index;
794

    
795
    return 1;
796
}
797

    
798
/*
799
 * alloc_compressed_cluster_offset
800
 *
801
 * For a given offset of the disk image, return cluster offset in
802
 * qcow2 file.
803
 *
804
 * If the offset is not found, allocate a new compressed cluster.
805
 *
806
 * Return the cluster offset if successful,
807
 * Return 0, otherwise.
808
 *
809
 */
810

    
811
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
812
                                                uint64_t offset,
813
                                                int compressed_size)
814
{
815
    BDRVQcowState *s = bs->opaque;
816
    int l2_index, ret;
817
    uint64_t l2_offset, *l2_table, cluster_offset;
818
    int nb_csectors;
819

    
820
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
821
    if (ret == 0)
822
        return 0;
823

    
824
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
825
    if (cluster_offset & QCOW_OFLAG_COPIED)
826
        return cluster_offset & ~QCOW_OFLAG_COPIED;
827

    
828
    if (cluster_offset)
829
        free_any_clusters(bs, cluster_offset, 1);
830

    
831
    cluster_offset = alloc_bytes(bs, compressed_size);
832
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
833
                  (cluster_offset >> 9);
834

    
835
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
836
                      ((uint64_t)nb_csectors << s->csize_shift);
837

    
838
    /* update L2 table */
839

    
840
    /* compressed clusters never have the copied flag */
841

    
842
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
843
    if (bdrv_pwrite(s->hd,
844
                    l2_offset + l2_index * sizeof(uint64_t),
845
                    l2_table + l2_index,
846
                    sizeof(uint64_t)) != sizeof(uint64_t))
847
        return 0;
848

    
849
    return cluster_offset;
850
}
851

    
852
/*
853
 * alloc_cluster_offset
854
 *
855
 * For a given offset of the disk image, return cluster offset in
856
 * qcow2 file.
857
 *
858
 * If the offset is not found, allocate a new cluster.
859
 *
860
 * Return the cluster offset if successful,
861
 * Return 0, otherwise.
862
 *
863
 */
864

    
865
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
866
                                     uint64_t offset,
867
                                     int n_start, int n_end,
868
                                     int *num)
869
{
870
    BDRVQcowState *s = bs->opaque;
871
    int l2_index, ret;
872
    uint64_t l2_offset, *l2_table, cluster_offset;
873
    int nb_available, nb_clusters, i, j;
874
    uint64_t start_sect, current;
875

    
876
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
877
    if (ret == 0)
878
        return 0;
879

    
880
    nb_clusters = ((n_end << 9) + s->cluster_size - 1) >>
881
                  s->cluster_bits;
882
    if (nb_clusters > s->l2_size - l2_index)
883
            nb_clusters = s->l2_size - l2_index;
884

    
885
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
886

    
887
    /* We keep all QCOW_OFLAG_COPIED clusters */
888

    
889
    if (cluster_offset & QCOW_OFLAG_COPIED) {
890

    
891
        for (i = 1; i < nb_clusters; i++) {
892
            current = be64_to_cpu(l2_table[l2_index + i]);
893
            if (cluster_offset + (i << s->cluster_bits) != current)
894
                break;
895
        }
896
        nb_clusters = i;
897

    
898
        nb_available = nb_clusters << (s->cluster_bits - 9);
899
        if (nb_available > n_end)
900
            nb_available = n_end;
901

    
902
        cluster_offset &= ~QCOW_OFLAG_COPIED;
903

    
904
        goto out;
905
    }
906

    
907
    /* for the moment, multiple compressed clusters are not managed */
908

    
909
    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
910
        nb_clusters = 1;
911

    
912
    /* how many available clusters ? */
913

    
914
    i = 0;
915
    while (i < nb_clusters) {
916

    
917
        i++;
918

    
919
        if (!cluster_offset) {
920

    
921
            /* how many free clusters ? */
922

    
923
            while (i < nb_clusters) {
924
                cluster_offset = l2_table[l2_index + i];
925
                if (cluster_offset != 0)
926
                    break;
927
                i++;
928
            }
929

    
930
            if ((cluster_offset & QCOW_OFLAG_COPIED) ||
931
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
932
                break;
933

    
934
        } else {
935

    
936
            /* how many contiguous clusters ? */
937

    
938
            j = 1;
939
            current = 0;
940
            while (i < nb_clusters) {
941
                current = be64_to_cpu(l2_table[l2_index + i]);
942
                if (cluster_offset + (j << s->cluster_bits) != current)
943
                    break;
944

    
945
                i++;
946
                j++;
947
            }
948

    
949
            free_any_clusters(bs, cluster_offset, j);
950
            if (current)
951
                break;
952
            cluster_offset = current;
953
        }
954
    }
955
    nb_clusters = i;
956

    
957
    /* allocate a new cluster */
958

    
959
    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
960

    
961
    /* we must initialize the cluster content which won't be
962
       written */
963

    
964
    nb_available = nb_clusters << (s->cluster_bits - 9);
965
    if (nb_available > n_end)
966
        nb_available = n_end;
967

    
968
    /* copy content of unmodified sectors */
969

    
970
    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
971
    if (n_start) {
972
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, n_start);
973
        if (ret < 0)
974
            return 0;
975
    }
976

    
977
    if (nb_available & (s->cluster_sectors - 1)) {
978
        uint64_t end = nb_available & ~(uint64_t)(s->cluster_sectors - 1);
979
        ret = copy_sectors(bs, start_sect + end,
980
                           cluster_offset + (end << 9),
981
                           nb_available - end,
982
                           s->cluster_sectors);
983
        if (ret < 0)
984
            return 0;
985
    }
986

    
987
    /* update L2 table */
988

    
989
    for (i = 0; i < nb_clusters; i++)
990
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
991
                                             (i << s->cluster_bits)) |
992
                                             QCOW_OFLAG_COPIED);
993

    
994
    if (bdrv_pwrite(s->hd,
995
                    l2_offset + l2_index * sizeof(uint64_t),
996
                    l2_table + l2_index,
997
                    nb_clusters * sizeof(uint64_t)) !=
998
                    nb_clusters * sizeof(uint64_t))
999
        return 0;
1000

    
1001
out:
1002
    *num = nb_available - n_start;
1003

    
1004
    return cluster_offset;
1005
}
1006

    
1007
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
1008
                             int nb_sectors, int *pnum)
1009
{
1010
    uint64_t cluster_offset;
1011

    
1012
    *pnum = nb_sectors;
1013
    cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum);
1014

    
1015
    return (cluster_offset != 0);
1016
}
1017

    
1018
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1019
                             const uint8_t *buf, int buf_size)
1020
{
1021
    z_stream strm1, *strm = &strm1;
1022
    int ret, out_len;
1023

    
1024
    memset(strm, 0, sizeof(*strm));
1025

    
1026
    strm->next_in = (uint8_t *)buf;
1027
    strm->avail_in = buf_size;
1028
    strm->next_out = out_buf;
1029
    strm->avail_out = out_buf_size;
1030

    
1031
    ret = inflateInit2(strm, -12);
1032
    if (ret != Z_OK)
1033
        return -1;
1034
    ret = inflate(strm, Z_FINISH);
1035
    out_len = strm->next_out - out_buf;
1036
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1037
        out_len != out_buf_size) {
1038
        inflateEnd(strm);
1039
        return -1;
1040
    }
1041
    inflateEnd(strm);
1042
    return 0;
1043
}
1044

    
1045
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
1046
{
1047
    int ret, csize, nb_csectors, sector_offset;
1048
    uint64_t coffset;
1049

    
1050
    coffset = cluster_offset & s->cluster_offset_mask;
1051
    if (s->cluster_cache_offset != coffset) {
1052
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1053
        sector_offset = coffset & 511;
1054
        csize = nb_csectors * 512 - sector_offset;
1055
        ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
1056
        if (ret < 0) {
1057
            return -1;
1058
        }
1059
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
1060
                              s->cluster_data + sector_offset, csize) < 0) {
1061
            return -1;
1062
        }
1063
        s->cluster_cache_offset = coffset;
1064
    }
1065
    return 0;
1066
}
1067

    
1068
/* handle reading after the end of the backing file */
1069
static int backing_read1(BlockDriverState *bs,
1070
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
1071
{
1072
    int n1;
1073
    if ((sector_num + nb_sectors) <= bs->total_sectors)
1074
        return nb_sectors;
1075
    if (sector_num >= bs->total_sectors)
1076
        n1 = 0;
1077
    else
1078
        n1 = bs->total_sectors - sector_num;
1079
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
1080
    return n1;
1081
}
1082

    
1083
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
1084
                     uint8_t *buf, int nb_sectors)
1085
{
1086
    BDRVQcowState *s = bs->opaque;
1087
    int ret, index_in_cluster, n, n1;
1088
    uint64_t cluster_offset;
1089

    
1090
    while (nb_sectors > 0) {
1091
        n = nb_sectors;
1092
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
1093
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1094
        if (!cluster_offset) {
1095
            if (bs->backing_hd) {
1096
                /* read from the base image */
1097
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
1098
                if (n1 > 0) {
1099
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
1100
                    if (ret < 0)
1101
                        return -1;
1102
                }
1103
            } else {
1104
                memset(buf, 0, 512 * n);
1105
            }
1106
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
1107
            if (decompress_cluster(s, cluster_offset) < 0)
1108
                return -1;
1109
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
1110
        } else {
1111
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1112
            if (ret != n * 512)
1113
                return -1;
1114
            if (s->crypt_method) {
1115
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
1116
                                &s->aes_decrypt_key);
1117
            }
1118
        }
1119
        nb_sectors -= n;
1120
        sector_num += n;
1121
        buf += n * 512;
1122
    }
1123
    return 0;
1124
}
1125

    
1126
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1127
                     const uint8_t *buf, int nb_sectors)
1128
{
1129
    BDRVQcowState *s = bs->opaque;
1130
    int ret, index_in_cluster, n;
1131
    uint64_t cluster_offset;
1132
    int n_end;
1133

    
1134
    while (nb_sectors > 0) {
1135
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1136
        n_end = index_in_cluster + nb_sectors;
1137
        if (s->crypt_method &&
1138
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1139
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1140
        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
1141
                                              index_in_cluster,
1142
                                              n_end, &n);
1143
        if (!cluster_offset)
1144
            return -1;
1145
        if (s->crypt_method) {
1146
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
1147
                            &s->aes_encrypt_key);
1148
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
1149
                              s->cluster_data, n * 512);
1150
        } else {
1151
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1152
        }
1153
        if (ret != n * 512)
1154
            return -1;
1155
        nb_sectors -= n;
1156
        sector_num += n;
1157
        buf += n * 512;
1158
    }
1159
    s->cluster_cache_offset = -1; /* disable compressed cache */
1160
    return 0;
1161
}
1162

    
1163
typedef struct QCowAIOCB {
1164
    BlockDriverAIOCB common;
1165
    int64_t sector_num;
1166
    uint8_t *buf;
1167
    int nb_sectors;
1168
    int n;
1169
    uint64_t cluster_offset;
1170
    uint8_t *cluster_data;
1171
    BlockDriverAIOCB *hd_aiocb;
1172
} QCowAIOCB;
1173

    
1174
static void qcow_aio_read_cb(void *opaque, int ret)
1175
{
1176
    QCowAIOCB *acb = opaque;
1177
    BlockDriverState *bs = acb->common.bs;
1178
    BDRVQcowState *s = bs->opaque;
1179
    int index_in_cluster, n1;
1180

    
1181
    acb->hd_aiocb = NULL;
1182
    if (ret < 0) {
1183
    fail:
1184
        acb->common.cb(acb->common.opaque, ret);
1185
        qemu_aio_release(acb);
1186
        return;
1187
    }
1188

    
1189
 redo:
1190
    /* post process the read buffer */
1191
    if (!acb->cluster_offset) {
1192
        /* nothing to do */
1193
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1194
        /* nothing to do */
1195
    } else {
1196
        if (s->crypt_method) {
1197
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
1198
                            acb->n, 0,
1199
                            &s->aes_decrypt_key);
1200
        }
1201
    }
1202

    
1203
    acb->nb_sectors -= acb->n;
1204
    acb->sector_num += acb->n;
1205
    acb->buf += acb->n * 512;
1206

    
1207
    if (acb->nb_sectors == 0) {
1208
        /* request completed */
1209
        acb->common.cb(acb->common.opaque, 0);
1210
        qemu_aio_release(acb);
1211
        return;
1212
    }
1213

    
1214
    /* prepare next AIO request */
1215
    acb->n = acb->nb_sectors;
1216
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
1217
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1218

    
1219
    if (!acb->cluster_offset) {
1220
        if (bs->backing_hd) {
1221
            /* read from the base image */
1222
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
1223
                               acb->buf, acb->n);
1224
            if (n1 > 0) {
1225
                acb->hd_aiocb = bdrv_aio_read(bs->backing_hd, acb->sector_num,
1226
                                    acb->buf, acb->n, qcow_aio_read_cb, acb);
1227
                if (acb->hd_aiocb == NULL)
1228
                    goto fail;
1229
            } else {
1230
                goto redo;
1231
            }
1232
        } else {
1233
            /* Note: in this case, no need to wait */
1234
            memset(acb->buf, 0, 512 * acb->n);
1235
            goto redo;
1236
        }
1237
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1238
        /* add AIO support for compressed blocks ? */
1239
        if (decompress_cluster(s, acb->cluster_offset) < 0)
1240
            goto fail;
1241
        memcpy(acb->buf,
1242
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
1243
        goto redo;
1244
    } else {
1245
        if ((acb->cluster_offset & 511) != 0) {
1246
            ret = -EIO;
1247
            goto fail;
1248
        }
1249
        acb->hd_aiocb = bdrv_aio_read(s->hd,
1250
                            (acb->cluster_offset >> 9) + index_in_cluster,
1251
                            acb->buf, acb->n, qcow_aio_read_cb, acb);
1252
        if (acb->hd_aiocb == NULL)
1253
            goto fail;
1254
    }
1255
}
1256

    
1257
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1258
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1259
        BlockDriverCompletionFunc *cb, void *opaque)
1260
{
1261
    QCowAIOCB *acb;
1262

    
1263
    acb = qemu_aio_get(bs, cb, opaque);
1264
    if (!acb)
1265
        return NULL;
1266
    acb->hd_aiocb = NULL;
1267
    acb->sector_num = sector_num;
1268
    acb->buf = buf;
1269
    acb->nb_sectors = nb_sectors;
1270
    acb->n = 0;
1271
    acb->cluster_offset = 0;
1272
    return acb;
1273
}
1274

    
1275
static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
1276
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1277
        BlockDriverCompletionFunc *cb, void *opaque)
1278
{
1279
    QCowAIOCB *acb;
1280

    
1281
    acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
1282
    if (!acb)
1283
        return NULL;
1284

    
1285
    qcow_aio_read_cb(acb, 0);
1286
    return &acb->common;
1287
}
1288

    
1289
static void qcow_aio_write_cb(void *opaque, int ret)
1290
{
1291
    QCowAIOCB *acb = opaque;
1292
    BlockDriverState *bs = acb->common.bs;
1293
    BDRVQcowState *s = bs->opaque;
1294
    int index_in_cluster;
1295
    uint64_t cluster_offset;
1296
    const uint8_t *src_buf;
1297
    int n_end;
1298

    
1299
    acb->hd_aiocb = NULL;
1300

    
1301
    if (ret < 0) {
1302
    fail:
1303
        acb->common.cb(acb->common.opaque, ret);
1304
        qemu_aio_release(acb);
1305
        return;
1306
    }
1307

    
1308
    acb->nb_sectors -= acb->n;
1309
    acb->sector_num += acb->n;
1310
    acb->buf += acb->n * 512;
1311

    
1312
    if (acb->nb_sectors == 0) {
1313
        /* request completed */
1314
        acb->common.cb(acb->common.opaque, 0);
1315
        qemu_aio_release(acb);
1316
        return;
1317
    }
1318

    
1319
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1320
    n_end = index_in_cluster + acb->nb_sectors;
1321
    if (s->crypt_method &&
1322
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1323
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1324

    
1325
    cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1326
                                          index_in_cluster,
1327
                                          n_end, &acb->n);
1328
    if (!cluster_offset || (cluster_offset & 511) != 0) {
1329
        ret = -EIO;
1330
        goto fail;
1331
    }
1332
    if (s->crypt_method) {
1333
        if (!acb->cluster_data) {
1334
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
1335
                                             s->cluster_size);
1336
            if (!acb->cluster_data) {
1337
                ret = -ENOMEM;
1338
                goto fail;
1339
            }
1340
        }
1341
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
1342
                        acb->n, 1, &s->aes_encrypt_key);
1343
        src_buf = acb->cluster_data;
1344
    } else {
1345
        src_buf = acb->buf;
1346
    }
1347
    acb->hd_aiocb = bdrv_aio_write(s->hd,
1348
                                   (cluster_offset >> 9) + index_in_cluster,
1349
                                   src_buf, acb->n,
1350
                                   qcow_aio_write_cb, acb);
1351
    if (acb->hd_aiocb == NULL)
1352
        goto fail;
1353
}
1354

    
1355
static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
1356
        int64_t sector_num, const uint8_t *buf, int nb_sectors,
1357
        BlockDriverCompletionFunc *cb, void *opaque)
1358
{
1359
    BDRVQcowState *s = bs->opaque;
1360
    QCowAIOCB *acb;
1361

    
1362
    s->cluster_cache_offset = -1; /* disable compressed cache */
1363

    
1364
    acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
1365
    if (!acb)
1366
        return NULL;
1367

    
1368
    qcow_aio_write_cb(acb, 0);
1369
    return &acb->common;
1370
}
1371

    
1372
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
1373
{
1374
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;
1375
    if (acb->hd_aiocb)
1376
        bdrv_aio_cancel(acb->hd_aiocb);
1377
    qemu_aio_release(acb);
1378
}
1379

    
1380
static void qcow_close(BlockDriverState *bs)
1381
{
1382
    BDRVQcowState *s = bs->opaque;
1383
    qemu_free(s->l1_table);
1384
    qemu_free(s->l2_cache);
1385
    qemu_free(s->cluster_cache);
1386
    qemu_free(s->cluster_data);
1387
    refcount_close(bs);
1388
    bdrv_delete(s->hd);
1389
}
1390

    
1391
/* XXX: use std qcow open function ? */
1392
typedef struct QCowCreateState {
1393
    int cluster_size;
1394
    int cluster_bits;
1395
    uint16_t *refcount_block;
1396
    uint64_t *refcount_table;
1397
    int64_t l1_table_offset;
1398
    int64_t refcount_table_offset;
1399
    int64_t refcount_block_offset;
1400
} QCowCreateState;
1401

    
1402
static void create_refcount_update(QCowCreateState *s,
1403
                                   int64_t offset, int64_t size)
1404
{
1405
    int refcount;
1406
    int64_t start, last, cluster_offset;
1407
    uint16_t *p;
1408

    
1409
    start = offset & ~(s->cluster_size - 1);
1410
    last = (offset + size - 1)  & ~(s->cluster_size - 1);
1411
    for(cluster_offset = start; cluster_offset <= last;
1412
        cluster_offset += s->cluster_size) {
1413
        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
1414
        refcount = be16_to_cpu(*p);
1415
        refcount++;
1416
        *p = cpu_to_be16(refcount);
1417
    }
1418
}
1419

    
1420
static int qcow_create(const char *filename, int64_t total_size,
1421
                      const char *backing_file, int flags)
1422
{
1423
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
1424
    QCowHeader header;
1425
    uint64_t tmp, offset;
1426
    QCowCreateState s1, *s = &s1;
1427

    
1428
    memset(s, 0, sizeof(*s));
1429

    
1430
    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
1431
    if (fd < 0)
1432
        return -1;
1433
    memset(&header, 0, sizeof(header));
1434
    header.magic = cpu_to_be32(QCOW_MAGIC);
1435
    header.version = cpu_to_be32(QCOW_VERSION);
1436
    header.size = cpu_to_be64(total_size * 512);
1437
    header_size = sizeof(header);
1438
    backing_filename_len = 0;
1439
    if (backing_file) {
1440
        header.backing_file_offset = cpu_to_be64(header_size);
1441
        backing_filename_len = strlen(backing_file);
1442
        header.backing_file_size = cpu_to_be32(backing_filename_len);
1443
        header_size += backing_filename_len;
1444
    }
1445
    s->cluster_bits = 12;  /* 4 KB clusters */
1446
    s->cluster_size = 1 << s->cluster_bits;
1447
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
1448
    header_size = (header_size + 7) & ~7;
1449
    if (flags & BLOCK_FLAG_ENCRYPT) {
1450
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1451
    } else {
1452
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1453
    }
1454
    l2_bits = s->cluster_bits - 3;
1455
    shift = s->cluster_bits + l2_bits;
1456
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
1457
    offset = align_offset(header_size, s->cluster_size);
1458
    s->l1_table_offset = offset;
1459
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
1460
    header.l1_size = cpu_to_be32(l1_size);
1461
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
1462

    
1463
    s->refcount_table = qemu_mallocz(s->cluster_size);
1464
    if (!s->refcount_table)
1465
        goto fail;
1466
    s->refcount_block = qemu_mallocz(s->cluster_size);
1467
    if (!s->refcount_block)
1468
        goto fail;
1469

    
1470
    s->refcount_table_offset = offset;
1471
    header.refcount_table_offset = cpu_to_be64(offset);
1472
    header.refcount_table_clusters = cpu_to_be32(1);
1473
    offset += s->cluster_size;
1474

    
1475
    s->refcount_table[0] = cpu_to_be64(offset);
1476
    s->refcount_block_offset = offset;
1477
    offset += s->cluster_size;
1478

    
1479
    /* update refcounts */
1480
    create_refcount_update(s, 0, header_size);
1481
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
1482
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
1483
    create_refcount_update(s, s->refcount_block_offset, s->cluster_size);
1484

    
1485
    /* write all the data */
1486
    write(fd, &header, sizeof(header));
1487
    if (backing_file) {
1488
        write(fd, backing_file, backing_filename_len);
1489
    }
1490
    lseek(fd, s->l1_table_offset, SEEK_SET);
1491
    tmp = 0;
1492
    for(i = 0;i < l1_size; i++) {
1493
        write(fd, &tmp, sizeof(tmp));
1494
    }
1495
    lseek(fd, s->refcount_table_offset, SEEK_SET);
1496
    write(fd, s->refcount_table, s->cluster_size);
1497

    
1498
    lseek(fd, s->refcount_block_offset, SEEK_SET);
1499
    write(fd, s->refcount_block, s->cluster_size);
1500

    
1501
    qemu_free(s->refcount_table);
1502
    qemu_free(s->refcount_block);
1503
    close(fd);
1504
    return 0;
1505
 fail:
1506
    qemu_free(s->refcount_table);
1507
    qemu_free(s->refcount_block);
1508
    close(fd);
1509
    return -ENOMEM;
1510
}
1511

    
1512
static int qcow_make_empty(BlockDriverState *bs)
1513
{
1514
#if 0
1515
    /* XXX: not correct */
1516
    BDRVQcowState *s = bs->opaque;
1517
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1518
    int ret;
1519

1520
    memset(s->l1_table, 0, l1_length);
1521
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
1522
        return -1;
1523
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
1524
    if (ret < 0)
1525
        return ret;
1526

1527
    l2_cache_reset(bs);
1528
#endif
1529
    return 0;
1530
}
1531

    
1532
/* XXX: put compressed sectors first, then all the cluster aligned
1533
   tables to avoid losing bytes in alignment */
1534
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
1535
                                 const uint8_t *buf, int nb_sectors)
1536
{
1537
    BDRVQcowState *s = bs->opaque;
1538
    z_stream strm;
1539
    int ret, out_len;
1540
    uint8_t *out_buf;
1541
    uint64_t cluster_offset;
1542

    
1543
    if (nb_sectors == 0) {
1544
        /* align end of file to a sector boundary to ease reading with
1545
           sector based I/Os */
1546
        cluster_offset = bdrv_getlength(s->hd);
1547
        cluster_offset = (cluster_offset + 511) & ~511;
1548
        bdrv_truncate(s->hd, cluster_offset);
1549
        return 0;
1550
    }
1551

    
1552
    if (nb_sectors != s->cluster_sectors)
1553
        return -EINVAL;
1554

    
1555
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1556
    if (!out_buf)
1557
        return -ENOMEM;
1558

    
1559
    /* best compression, small window, no zlib header */
1560
    memset(&strm, 0, sizeof(strm));
1561
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1562
                       Z_DEFLATED, -12,
1563
                       9, Z_DEFAULT_STRATEGY);
1564
    if (ret != 0) {
1565
        qemu_free(out_buf);
1566
        return -1;
1567
    }
1568

    
1569
    strm.avail_in = s->cluster_size;
1570
    strm.next_in = (uint8_t *)buf;
1571
    strm.avail_out = s->cluster_size;
1572
    strm.next_out = out_buf;
1573

    
1574
    ret = deflate(&strm, Z_FINISH);
1575
    if (ret != Z_STREAM_END && ret != Z_OK) {
1576
        qemu_free(out_buf);
1577
        deflateEnd(&strm);
1578
        return -1;
1579
    }
1580
    out_len = strm.next_out - out_buf;
1581

    
1582
    deflateEnd(&strm);
1583

    
1584
    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1585
        /* could not compress: write normal cluster */
1586
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
1587
    } else {
1588
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
1589
                                              out_len);
1590
        if (!cluster_offset)
1591
            return -1;
1592
        cluster_offset &= s->cluster_offset_mask;
1593
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
1594
            qemu_free(out_buf);
1595
            return -1;
1596
        }
1597
    }
1598

    
1599
    qemu_free(out_buf);
1600
    return 0;
1601
}
1602

    
1603
static void qcow_flush(BlockDriverState *bs)
1604
{
1605
    BDRVQcowState *s = bs->opaque;
1606
    bdrv_flush(s->hd);
1607
}
1608

    
1609
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1610
{
1611
    BDRVQcowState *s = bs->opaque;
1612
    bdi->cluster_size = s->cluster_size;
1613
    bdi->vm_state_offset = (int64_t)s->l1_vm_state_index <<
1614
        (s->cluster_bits + s->l2_bits);
1615
    return 0;
1616
}
1617

    
1618
/*********************************************************/
1619
/* snapshot support */
1620

    
1621
/* update the refcounts of snapshots and the copied flag */
1622
static int update_snapshot_refcount(BlockDriverState *bs,
1623
                                    int64_t l1_table_offset,
1624
                                    int l1_size,
1625
                                    int addend)
1626
{
1627
    BDRVQcowState *s = bs->opaque;
1628
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
1629
    int64_t old_offset, old_l2_offset;
1630
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
1631

    
1632
    l2_cache_reset(bs);
1633

    
1634
    l2_table = NULL;
1635
    l1_table = NULL;
1636
    l1_size2 = l1_size * sizeof(uint64_t);
1637
    l1_allocated = 0;
1638
    if (l1_table_offset != s->l1_table_offset) {
1639
        l1_table = qemu_malloc(l1_size2);
1640
        if (!l1_table)
1641
            goto fail;
1642
        l1_allocated = 1;
1643
        if (bdrv_pread(s->hd, l1_table_offset,
1644
                       l1_table, l1_size2) != l1_size2)
1645
            goto fail;
1646
        for(i = 0;i < l1_size; i++)
1647
            be64_to_cpus(&l1_table[i]);
1648
    } else {
1649
        assert(l1_size == s->l1_size);
1650
        l1_table = s->l1_table;
1651
        l1_allocated = 0;
1652
    }
1653

    
1654
    l2_size = s->l2_size * sizeof(uint64_t);
1655
    l2_table = qemu_malloc(l2_size);
1656
    if (!l2_table)
1657
        goto fail;
1658
    l1_modified = 0;
1659
    for(i = 0; i < l1_size; i++) {
1660
        l2_offset = l1_table[i];
1661
        if (l2_offset) {
1662
            old_l2_offset = l2_offset;
1663
            l2_offset &= ~QCOW_OFLAG_COPIED;
1664
            l2_modified = 0;
1665
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
1666
                goto fail;
1667
            for(j = 0; j < s->l2_size; j++) {
1668
                offset = be64_to_cpu(l2_table[j]);
1669
                if (offset != 0) {
1670
                    old_offset = offset;
1671
                    offset &= ~QCOW_OFLAG_COPIED;
1672
                    if (offset & QCOW_OFLAG_COMPRESSED) {
1673
                        nb_csectors = ((offset >> s->csize_shift) &
1674
                                       s->csize_mask) + 1;
1675
                        if (addend != 0)
1676
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
1677
                                            nb_csectors * 512, addend);
1678
                        /* compressed clusters are never modified */
1679
                        refcount = 2;
1680
                    } else {
1681
                        if (addend != 0) {
1682
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
1683
                        } else {
1684
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
1685
                        }
1686
                    }
1687

    
1688
                    if (refcount == 1) {
1689
                        offset |= QCOW_OFLAG_COPIED;
1690
                    }
1691
                    if (offset != old_offset) {
1692
                        l2_table[j] = cpu_to_be64(offset);
1693
                        l2_modified = 1;
1694
                    }
1695
                }
1696
            }
1697
            if (l2_modified) {
1698
                if (bdrv_pwrite(s->hd,
1699
                                l2_offset, l2_table, l2_size) != l2_size)
1700
                    goto fail;
1701
            }
1702

    
1703
            if (addend != 0) {
1704
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
1705
            } else {
1706
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1707
            }
1708
            if (refcount == 1) {
1709
                l2_offset |= QCOW_OFLAG_COPIED;
1710
            }
1711
            if (l2_offset != old_l2_offset) {
1712
                l1_table[i] = l2_offset;
1713
                l1_modified = 1;
1714
            }
1715
        }
1716
    }
1717
    if (l1_modified) {
1718
        for(i = 0; i < l1_size; i++)
1719
            cpu_to_be64s(&l1_table[i]);
1720
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
1721
                        l1_size2) != l1_size2)
1722
            goto fail;
1723
        for(i = 0; i < l1_size; i++)
1724
            be64_to_cpus(&l1_table[i]);
1725
    }
1726
    if (l1_allocated)
1727
        qemu_free(l1_table);
1728
    qemu_free(l2_table);
1729
    return 0;
1730
 fail:
1731
    if (l1_allocated)
1732
        qemu_free(l1_table);
1733
    qemu_free(l2_table);
1734
    return -EIO;
1735
}
1736

    
1737
static void qcow_free_snapshots(BlockDriverState *bs)
1738
{
1739
    BDRVQcowState *s = bs->opaque;
1740
    int i;
1741

    
1742
    for(i = 0; i < s->nb_snapshots; i++) {
1743
        qemu_free(s->snapshots[i].name);
1744
        qemu_free(s->snapshots[i].id_str);
1745
    }
1746
    qemu_free(s->snapshots);
1747
    s->snapshots = NULL;
1748
    s->nb_snapshots = 0;
1749
}
1750

    
1751
static int qcow_read_snapshots(BlockDriverState *bs)
1752
{
1753
    BDRVQcowState *s = bs->opaque;
1754
    QCowSnapshotHeader h;
1755
    QCowSnapshot *sn;
1756
    int i, id_str_size, name_size;
1757
    int64_t offset;
1758
    uint32_t extra_data_size;
1759

    
1760
    offset = s->snapshots_offset;
1761
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
1762
    if (!s->snapshots)
1763
        goto fail;
1764
    for(i = 0; i < s->nb_snapshots; i++) {
1765
        offset = align_offset(offset, 8);
1766
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1767
            goto fail;
1768
        offset += sizeof(h);
1769
        sn = s->snapshots + i;
1770
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
1771
        sn->l1_size = be32_to_cpu(h.l1_size);
1772
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
1773
        sn->date_sec = be32_to_cpu(h.date_sec);
1774
        sn->date_nsec = be32_to_cpu(h.date_nsec);
1775
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
1776
        extra_data_size = be32_to_cpu(h.extra_data_size);
1777

    
1778
        id_str_size = be16_to_cpu(h.id_str_size);
1779
        name_size = be16_to_cpu(h.name_size);
1780

    
1781
        offset += extra_data_size;
1782

    
1783
        sn->id_str = qemu_malloc(id_str_size + 1);
1784
        if (!sn->id_str)
1785
            goto fail;
1786
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1787
            goto fail;
1788
        offset += id_str_size;
1789
        sn->id_str[id_str_size] = '\0';
1790

    
1791
        sn->name = qemu_malloc(name_size + 1);
1792
        if (!sn->name)
1793
            goto fail;
1794
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
1795
            goto fail;
1796
        offset += name_size;
1797
        sn->name[name_size] = '\0';
1798
    }
1799
    s->snapshots_size = offset - s->snapshots_offset;
1800
    return 0;
1801
 fail:
1802
    qcow_free_snapshots(bs);
1803
    return -1;
1804
}
1805

    
1806
/* add at the end of the file a new list of snapshots */
1807
static int qcow_write_snapshots(BlockDriverState *bs)
1808
{
1809
    BDRVQcowState *s = bs->opaque;
1810
    QCowSnapshot *sn;
1811
    QCowSnapshotHeader h;
1812
    int i, name_size, id_str_size, snapshots_size;
1813
    uint64_t data64;
1814
    uint32_t data32;
1815
    int64_t offset, snapshots_offset;
1816

    
1817
    /* compute the size of the snapshots */
1818
    offset = 0;
1819
    for(i = 0; i < s->nb_snapshots; i++) {
1820
        sn = s->snapshots + i;
1821
        offset = align_offset(offset, 8);
1822
        offset += sizeof(h);
1823
        offset += strlen(sn->id_str);
1824
        offset += strlen(sn->name);
1825
    }
1826
    snapshots_size = offset;
1827

    
1828
    snapshots_offset = alloc_clusters(bs, snapshots_size);
1829
    offset = snapshots_offset;
1830

    
1831
    for(i = 0; i < s->nb_snapshots; i++) {
1832
        sn = s->snapshots + i;
1833
        memset(&h, 0, sizeof(h));
1834
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
1835
        h.l1_size = cpu_to_be32(sn->l1_size);
1836
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
1837
        h.date_sec = cpu_to_be32(sn->date_sec);
1838
        h.date_nsec = cpu_to_be32(sn->date_nsec);
1839
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
1840

    
1841
        id_str_size = strlen(sn->id_str);
1842
        name_size = strlen(sn->name);
1843
        h.id_str_size = cpu_to_be16(id_str_size);
1844
        h.name_size = cpu_to_be16(name_size);
1845
        offset = align_offset(offset, 8);
1846
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1847
            goto fail;
1848
        offset += sizeof(h);
1849
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1850
            goto fail;
1851
        offset += id_str_size;
1852
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
1853
            goto fail;
1854
        offset += name_size;
1855
    }
1856

    
1857
    /* update the various header fields */
1858
    data64 = cpu_to_be64(snapshots_offset);
1859
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
1860
                    &data64, sizeof(data64)) != sizeof(data64))
1861
        goto fail;
1862
    data32 = cpu_to_be32(s->nb_snapshots);
1863
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
1864
                    &data32, sizeof(data32)) != sizeof(data32))
1865
        goto fail;
1866

    
1867
    /* free the old snapshot table */
1868
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
1869
    s->snapshots_offset = snapshots_offset;
1870
    s->snapshots_size = snapshots_size;
1871
    return 0;
1872
 fail:
1873
    return -1;
1874
}
1875

    
1876
static void find_new_snapshot_id(BlockDriverState *bs,
1877
                                 char *id_str, int id_str_size)
1878
{
1879
    BDRVQcowState *s = bs->opaque;
1880
    QCowSnapshot *sn;
1881
    int i, id, id_max = 0;
1882

    
1883
    for(i = 0; i < s->nb_snapshots; i++) {
1884
        sn = s->snapshots + i;
1885
        id = strtoul(sn->id_str, NULL, 10);
1886
        if (id > id_max)
1887
            id_max = id;
1888
    }
1889
    snprintf(id_str, id_str_size, "%d", id_max + 1);
1890
}
1891

    
1892
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
1893
{
1894
    BDRVQcowState *s = bs->opaque;
1895
    int i;
1896

    
1897
    for(i = 0; i < s->nb_snapshots; i++) {
1898
        if (!strcmp(s->snapshots[i].id_str, id_str))
1899
            return i;
1900
    }
1901
    return -1;
1902
}
1903

    
1904
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
1905
{
1906
    BDRVQcowState *s = bs->opaque;
1907
    int i, ret;
1908

    
1909
    ret = find_snapshot_by_id(bs, name);
1910
    if (ret >= 0)
1911
        return ret;
1912
    for(i = 0; i < s->nb_snapshots; i++) {
1913
        if (!strcmp(s->snapshots[i].name, name))
1914
            return i;
1915
    }
1916
    return -1;
1917
}
1918

    
1919
/* if no id is provided, a new one is constructed */
1920
static int qcow_snapshot_create(BlockDriverState *bs,
1921
                                QEMUSnapshotInfo *sn_info)
1922
{
1923
    BDRVQcowState *s = bs->opaque;
1924
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
1925
    int i, ret;
1926
    uint64_t *l1_table = NULL;
1927

    
1928
    memset(sn, 0, sizeof(*sn));
1929

    
1930
    if (sn_info->id_str[0] == '\0') {
1931
        /* compute a new id */
1932
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
1933
    }
1934

    
1935
    /* check that the ID is unique */
1936
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
1937
        return -ENOENT;
1938

    
1939
    sn->id_str = qemu_strdup(sn_info->id_str);
1940
    if (!sn->id_str)
1941
        goto fail;
1942
    sn->name = qemu_strdup(sn_info->name);
1943
    if (!sn->name)
1944
        goto fail;
1945
    sn->vm_state_size = sn_info->vm_state_size;
1946
    sn->date_sec = sn_info->date_sec;
1947
    sn->date_nsec = sn_info->date_nsec;
1948
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
1949

    
1950
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
1951
    if (ret < 0)
1952
        goto fail;
1953

    
1954
    /* create the L1 table of the snapshot */
1955
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
1956
    sn->l1_size = s->l1_size;
1957

    
1958
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
1959
    if (!l1_table)
1960
        goto fail;
1961
    for(i = 0; i < s->l1_size; i++) {
1962
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
1963
    }
1964
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
1965
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
1966
        (s->l1_size * sizeof(uint64_t)))
1967
        goto fail;
1968
    qemu_free(l1_table);
1969
    l1_table = NULL;
1970

    
1971
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
1972
    if (!snapshots1)
1973
        goto fail;
1974
    memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
1975
    s->snapshots = snapshots1;
1976
    s->snapshots[s->nb_snapshots++] = *sn;
1977

    
1978
    if (qcow_write_snapshots(bs) < 0)
1979
        goto fail;
1980
#ifdef DEBUG_ALLOC
1981
    check_refcounts(bs);
1982
#endif
1983
    return 0;
1984
 fail:
1985
    qemu_free(sn->name);
1986
    qemu_free(l1_table);
1987
    return -1;
1988
}
1989

    
1990
/* copy the snapshot 'snapshot_name' into the current disk image */
1991
static int qcow_snapshot_goto(BlockDriverState *bs,
1992
                              const char *snapshot_id)
1993
{
1994
    BDRVQcowState *s = bs->opaque;
1995
    QCowSnapshot *sn;
1996
    int i, snapshot_index, l1_size2;
1997

    
1998
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
1999
    if (snapshot_index < 0)
2000
        return -ENOENT;
2001
    sn = &s->snapshots[snapshot_index];
2002

    
2003
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
2004
        goto fail;
2005

    
2006
    if (grow_l1_table(bs, sn->l1_size) < 0)
2007
        goto fail;
2008

    
2009
    s->l1_size = sn->l1_size;
2010
    l1_size2 = s->l1_size * sizeof(uint64_t);
2011
    /* copy the snapshot l1 table to the current l1 table */
2012
    if (bdrv_pread(s->hd, sn->l1_table_offset,
2013
                   s->l1_table, l1_size2) != l1_size2)
2014
        goto fail;
2015
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
2016
                    s->l1_table, l1_size2) != l1_size2)
2017
        goto fail;
2018
    for(i = 0;i < s->l1_size; i++) {
2019
        be64_to_cpus(&s->l1_table[i]);
2020
    }
2021

    
2022
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
2023
        goto fail;
2024

    
2025
#ifdef DEBUG_ALLOC
2026
    check_refcounts(bs);
2027
#endif
2028
    return 0;
2029
 fail:
2030
    return -EIO;
2031
}
2032

    
2033
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2034
{
2035
    BDRVQcowState *s = bs->opaque;
2036
    QCowSnapshot *sn;
2037
    int snapshot_index, ret;
2038

    
2039
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
2040
    if (snapshot_index < 0)
2041
        return -ENOENT;
2042
    sn = &s->snapshots[snapshot_index];
2043

    
2044
    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
2045
    if (ret < 0)
2046
        return ret;
2047
    /* must update the copied flag on the current cluster offsets */
2048
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
2049
    if (ret < 0)
2050
        return ret;
2051
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));
2052

    
2053
    qemu_free(sn->id_str);
2054
    qemu_free(sn->name);
2055
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
2056
    s->nb_snapshots--;
2057
    ret = qcow_write_snapshots(bs);
2058
    if (ret < 0) {
2059
        /* XXX: restore snapshot if error ? */
2060
        return ret;
2061
    }
2062
#ifdef DEBUG_ALLOC
2063
    check_refcounts(bs);
2064
#endif
2065
    return 0;
2066
}
2067

    
2068
static int qcow_snapshot_list(BlockDriverState *bs,
2069
                              QEMUSnapshotInfo **psn_tab)
2070
{
2071
    BDRVQcowState *s = bs->opaque;
2072
    QEMUSnapshotInfo *sn_tab, *sn_info;
2073
    QCowSnapshot *sn;
2074
    int i;
2075

    
2076
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2077
    if (!sn_tab)
2078
        goto fail;
2079
    for(i = 0; i < s->nb_snapshots; i++) {
2080
        sn_info = sn_tab + i;
2081
        sn = s->snapshots + i;
2082
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2083
                sn->id_str);
2084
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2085
                sn->name);
2086
        sn_info->vm_state_size = sn->vm_state_size;
2087
        sn_info->date_sec = sn->date_sec;
2088
        sn_info->date_nsec = sn->date_nsec;
2089
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2090
    }
2091
    *psn_tab = sn_tab;
2092
    return s->nb_snapshots;
2093
 fail:
2094
    qemu_free(sn_tab);
2095
    *psn_tab = NULL;
2096
    return -ENOMEM;
2097
}
2098

    
2099
/*********************************************************/
2100
/* refcount handling */
2101

    
2102
static int refcount_init(BlockDriverState *bs)
2103
{
2104
    BDRVQcowState *s = bs->opaque;
2105
    int ret, refcount_table_size2, i;
2106

    
2107
    s->refcount_block_cache = qemu_malloc(s->cluster_size);
2108
    if (!s->refcount_block_cache)
2109
        goto fail;
2110
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
2111
    s->refcount_table = qemu_malloc(refcount_table_size2);
2112
    if (!s->refcount_table)
2113
        goto fail;
2114
    if (s->refcount_table_size > 0) {
2115
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
2116
                         s->refcount_table, refcount_table_size2);
2117
        if (ret != refcount_table_size2)
2118
            goto fail;
2119
        for(i = 0; i < s->refcount_table_size; i++)
2120
            be64_to_cpus(&s->refcount_table[i]);
2121
    }
2122
    return 0;
2123
 fail:
2124
    return -ENOMEM;
2125
}
2126

    
2127
static void refcount_close(BlockDriverState *bs)
2128
{
2129
    BDRVQcowState *s = bs->opaque;
2130
    qemu_free(s->refcount_block_cache);
2131
    qemu_free(s->refcount_table);
2132
}
2133

    
2134

    
2135
static int load_refcount_block(BlockDriverState *bs,
2136
                               int64_t refcount_block_offset)
2137
{
2138
    BDRVQcowState *s = bs->opaque;
2139
    int ret;
2140
    ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
2141
                     s->cluster_size);
2142
    if (ret != s->cluster_size)
2143
        return -EIO;
2144
    s->refcount_block_cache_offset = refcount_block_offset;
2145
    return 0;
2146
}
2147

    
2148
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
2149
{
2150
    BDRVQcowState *s = bs->opaque;
2151
    int refcount_table_index, block_index;
2152
    int64_t refcount_block_offset;
2153

    
2154
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2155
    if (refcount_table_index >= s->refcount_table_size)
2156
        return 0;
2157
    refcount_block_offset = s->refcount_table[refcount_table_index];
2158
    if (!refcount_block_offset)
2159
        return 0;
2160
    if (refcount_block_offset != s->refcount_block_cache_offset) {
2161
        /* better than nothing: return allocated if read error */
2162
        if (load_refcount_block(bs, refcount_block_offset) < 0)
2163
            return 1;
2164
    }
2165
    block_index = cluster_index &
2166
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2167
    return be16_to_cpu(s->refcount_block_cache[block_index]);
2168
}
2169

    
2170
/* return < 0 if error */
2171
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2172
{
2173
    BDRVQcowState *s = bs->opaque;
2174
    int i, nb_clusters;
2175

    
2176
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
2177
    for(;;) {
2178
        if (get_refcount(bs, s->free_cluster_index) == 0) {
2179
            s->free_cluster_index++;
2180
            for(i = 1; i < nb_clusters; i++) {
2181
                if (get_refcount(bs, s->free_cluster_index) != 0)
2182
                    goto not_found;
2183
                s->free_cluster_index++;
2184
            }
2185
#ifdef DEBUG_ALLOC2
2186
            printf("alloc_clusters: size=%lld -> %lld\n",
2187
                   size,
2188
                   (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2189
#endif
2190
            return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2191
        } else {
2192
        not_found:
2193
            s->free_cluster_index++;
2194
        }
2195
    }
2196
}
2197

    
2198
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
2199
{
2200
    int64_t offset;
2201

    
2202
    offset = alloc_clusters_noref(bs, size);
2203
    update_refcount(bs, offset, size, 1);
2204
    return offset;
2205
}
2206

    
2207
/* only used to allocate compressed sectors. We try to allocate
2208
   contiguous sectors. size must be <= cluster_size */
2209
static int64_t alloc_bytes(BlockDriverState *bs, int size)
2210
{
2211
    BDRVQcowState *s = bs->opaque;
2212
    int64_t offset, cluster_offset;
2213
    int free_in_cluster;
2214

    
2215
    assert(size > 0 && size <= s->cluster_size);
2216
    if (s->free_byte_offset == 0) {
2217
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
2218
    }
2219
 redo:
2220
    free_in_cluster = s->cluster_size -
2221
        (s->free_byte_offset & (s->cluster_size - 1));
2222
    if (size <= free_in_cluster) {
2223
        /* enough space in current cluster */
2224
        offset = s->free_byte_offset;
2225
        s->free_byte_offset += size;
2226
        free_in_cluster -= size;
2227
        if (free_in_cluster == 0)
2228
            s->free_byte_offset = 0;
2229
        if ((offset & (s->cluster_size - 1)) != 0)
2230
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2231
    } else {
2232
        offset = alloc_clusters(bs, s->cluster_size);
2233
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
2234
        if ((cluster_offset + s->cluster_size) == offset) {
2235
            /* we are lucky: contiguous data */
2236
            offset = s->free_byte_offset;
2237
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2238
            s->free_byte_offset += size;
2239
        } else {
2240
            s->free_byte_offset = offset;
2241
            goto redo;
2242
        }
2243
    }
2244
    return offset;
2245
}
2246

    
2247
static void free_clusters(BlockDriverState *bs,
2248
                          int64_t offset, int64_t size)
2249
{
2250
    update_refcount(bs, offset, size, -1);
2251
}
2252

    
2253
static int grow_refcount_table(BlockDriverState *bs, int min_size)
2254
{
2255
    BDRVQcowState *s = bs->opaque;
2256
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
2257
    uint64_t *new_table;
2258
    int64_t table_offset;
2259
    uint64_t data64;
2260
    uint32_t data32;
2261
    int old_table_size;
2262
    int64_t old_table_offset;
2263

    
2264
    if (min_size <= s->refcount_table_size)
2265
        return 0;
2266
    /* compute new table size */
2267
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
2268
    for(;;) {
2269
        if (refcount_table_clusters == 0) {
2270
            refcount_table_clusters = 1;
2271
        } else {
2272
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
2273
        }
2274
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
2275
        if (min_size <= new_table_size)
2276
            break;
2277
    }
2278
#ifdef DEBUG_ALLOC2
2279
    printf("grow_refcount_table from %d to %d\n",
2280
           s->refcount_table_size,
2281
           new_table_size);
2282
#endif
2283
    new_table_size2 = new_table_size * sizeof(uint64_t);
2284
    new_table = qemu_mallocz(new_table_size2);
2285
    if (!new_table)
2286
        return -ENOMEM;
2287
    memcpy(new_table, s->refcount_table,
2288
           s->refcount_table_size * sizeof(uint64_t));
2289
    for(i = 0; i < s->refcount_table_size; i++)
2290
        cpu_to_be64s(&new_table[i]);
2291
    /* Note: we cannot update the refcount now to avoid recursion */
2292
    table_offset = alloc_clusters_noref(bs, new_table_size2);
2293
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
2294
    if (ret != new_table_size2)
2295
        goto fail;
2296
    for(i = 0; i < s->refcount_table_size; i++)
2297
        be64_to_cpus(&new_table[i]);
2298

    
2299
    data64 = cpu_to_be64(table_offset);
2300
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
2301
                    &data64, sizeof(data64)) != sizeof(data64))
2302
        goto fail;
2303
    data32 = cpu_to_be32(refcount_table_clusters);
2304
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_clusters),
2305
                    &data32, sizeof(data32)) != sizeof(data32))
2306
        goto fail;
2307
    qemu_free(s->refcount_table);
2308
    old_table_offset = s->refcount_table_offset;
2309
    old_table_size = s->refcount_table_size;
2310
    s->refcount_table = new_table;
2311
    s->refcount_table_size = new_table_size;
2312
    s->refcount_table_offset = table_offset;
2313

    
2314
    update_refcount(bs, table_offset, new_table_size2, 1);
2315
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
2316
    return 0;
2317
 fail:
2318
    free_clusters(bs, table_offset, new_table_size2);
2319
    qemu_free(new_table);
2320
    return -EIO;
2321
}
2322

    
2323
/* addend must be 1 or -1 */
2324
/* XXX: cache several refcount block clusters ? */
2325
static int update_cluster_refcount(BlockDriverState *bs,
2326
                                   int64_t cluster_index,
2327
                                   int addend)
2328
{
2329
    BDRVQcowState *s = bs->opaque;
2330
    int64_t offset, refcount_block_offset;
2331
    int ret, refcount_table_index, block_index, refcount;
2332
    uint64_t data64;
2333

    
2334
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2335
    if (refcount_table_index >= s->refcount_table_size) {
2336
        if (addend < 0)
2337
            return -EINVAL;
2338
        ret = grow_refcount_table(bs, refcount_table_index + 1);
2339
        if (ret < 0)
2340
            return ret;
2341
    }
2342
    refcount_block_offset = s->refcount_table[refcount_table_index];
2343
    if (!refcount_block_offset) {
2344
        if (addend < 0)
2345
            return -EINVAL;
2346
        /* create a new refcount block */
2347
        /* Note: we cannot update the refcount now to avoid recursion */
2348
        offset = alloc_clusters_noref(bs, s->cluster_size);
2349
        memset(s->refcount_block_cache, 0, s->cluster_size);
2350
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
2351
        if (ret != s->cluster_size)
2352
            return -EINVAL;
2353
        s->refcount_table[refcount_table_index] = offset;
2354
        data64 = cpu_to_be64(offset);
2355
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
2356
                          refcount_table_index * sizeof(uint64_t),
2357
                          &data64, sizeof(data64));
2358
        if (ret != sizeof(data64))
2359
            return -EINVAL;
2360

    
2361
        refcount_block_offset = offset;
2362
        s->refcount_block_cache_offset = offset;
2363
        update_refcount(bs, offset, s->cluster_size, 1);
2364
    } else {
2365
        if (refcount_block_offset != s->refcount_block_cache_offset) {
2366
            if (load_refcount_block(bs, refcount_block_offset) < 0)
2367
                return -EIO;
2368
        }
2369
    }
2370
    /* we can update the count and save it */
2371
    block_index = cluster_index &
2372
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2373
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
2374
    refcount += addend;
2375
    if (refcount < 0 || refcount > 0xffff)
2376
        return -EINVAL;
2377
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
2378
        s->free_cluster_index = cluster_index;
2379
    }
2380
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
2381
    if (bdrv_pwrite(s->hd,
2382
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
2383
                    &s->refcount_block_cache[block_index], 2) != 2)
2384
        return -EIO;
2385
    return refcount;
2386
}
2387

    
2388
static void update_refcount(BlockDriverState *bs,
2389
                            int64_t offset, int64_t length,
2390
                            int addend)
2391
{
2392
    BDRVQcowState *s = bs->opaque;
2393
    int64_t start, last, cluster_offset;
2394

    
2395
#ifdef DEBUG_ALLOC2
2396
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
2397
           offset, length, addend);
2398
#endif
2399
    if (length <= 0)
2400
        return;
2401
    start = offset & ~(s->cluster_size - 1);
2402
    last = (offset + length - 1) & ~(s->cluster_size - 1);
2403
    for(cluster_offset = start; cluster_offset <= last;
2404
        cluster_offset += s->cluster_size) {
2405
        update_cluster_refcount(bs, cluster_offset >> s->cluster_bits, addend);
2406
    }
2407
}
2408

    
2409
#ifdef DEBUG_ALLOC
2410
static void inc_refcounts(BlockDriverState *bs,
2411
                          uint16_t *refcount_table,
2412
                          int refcount_table_size,
2413
                          int64_t offset, int64_t size)
2414
{
2415
    BDRVQcowState *s = bs->opaque;
2416
    int64_t start, last, cluster_offset;
2417
    int k;
2418

    
2419
    if (size <= 0)
2420
        return;
2421

    
2422
    start = offset & ~(s->cluster_size - 1);
2423
    last = (offset + size - 1) & ~(s->cluster_size - 1);
2424
    for(cluster_offset = start; cluster_offset <= last;
2425
        cluster_offset += s->cluster_size) {
2426
        k = cluster_offset >> s->cluster_bits;
2427
        if (k < 0 || k >= refcount_table_size) {
2428
            printf("ERROR: invalid cluster offset=0x%llx\n", cluster_offset);
2429
        } else {
2430
            if (++refcount_table[k] == 0) {
2431
                printf("ERROR: overflow cluster offset=0x%llx\n", cluster_offset);
2432
            }
2433
        }
2434
    }
2435
}
2436

    
2437
static int check_refcounts_l1(BlockDriverState *bs,
2438
                              uint16_t *refcount_table,
2439
                              int refcount_table_size,
2440
                              int64_t l1_table_offset, int l1_size,
2441
                              int check_copied)
2442
{
2443
    BDRVQcowState *s = bs->opaque;
2444
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
2445
    int l2_size, i, j, nb_csectors, refcount;
2446

    
2447
    l2_table = NULL;
2448
    l1_size2 = l1_size * sizeof(uint64_t);
2449

    
2450
    inc_refcounts(bs, refcount_table, refcount_table_size,
2451
                  l1_table_offset, l1_size2);
2452

    
2453
    l1_table = qemu_malloc(l1_size2);
2454
    if (!l1_table)
2455
        goto fail;
2456
    if (bdrv_pread(s->hd, l1_table_offset,
2457
                   l1_table, l1_size2) != l1_size2)
2458
        goto fail;
2459
    for(i = 0;i < l1_size; i++)
2460
        be64_to_cpus(&l1_table[i]);
2461

    
2462
    l2_size = s->l2_size * sizeof(uint64_t);
2463
    l2_table = qemu_malloc(l2_size);
2464
    if (!l2_table)
2465
        goto fail;
2466
    for(i = 0; i < l1_size; i++) {
2467
        l2_offset = l1_table[i];
2468
        if (l2_offset) {
2469
            if (check_copied) {
2470
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
2471
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
2472
                    printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
2473
                           l2_offset, refcount);
2474
                }
2475
            }
2476
            l2_offset &= ~QCOW_OFLAG_COPIED;
2477
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
2478
                goto fail;
2479
            for(j = 0; j < s->l2_size; j++) {
2480
                offset = be64_to_cpu(l2_table[j]);
2481
                if (offset != 0) {
2482
                    if (offset & QCOW_OFLAG_COMPRESSED) {
2483
                        if (offset & QCOW_OFLAG_COPIED) {
2484
                            printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
2485
                                   offset >> s->cluster_bits);
2486
                            offset &= ~QCOW_OFLAG_COPIED;
2487
                        }
2488
                        nb_csectors = ((offset >> s->csize_shift) &
2489
                                       s->csize_mask) + 1;
2490
                        offset &= s->cluster_offset_mask;
2491
                        inc_refcounts(bs, refcount_table,
2492
                                      refcount_table_size,
2493
                                      offset & ~511, nb_csectors * 512);
2494
                    } else {
2495
                        if (check_copied) {
2496
                            refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
2497
                            if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
2498
                                printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
2499
                                       offset, refcount);
2500
                            }
2501
                        }
2502
                        offset &= ~QCOW_OFLAG_COPIED;
2503
                        inc_refcounts(bs, refcount_table,
2504
                                      refcount_table_size,
2505
                                      offset, s->cluster_size);
2506
                    }
2507
                }
2508
            }
2509
            inc_refcounts(bs, refcount_table,
2510
                          refcount_table_size,
2511
                          l2_offset,
2512
                          s->cluster_size);
2513
        }
2514
    }
2515
    qemu_free(l1_table);
2516
    qemu_free(l2_table);
2517
    return 0;
2518
 fail:
2519
    printf("ERROR: I/O error in check_refcounts_l1\n");
2520
    qemu_free(l1_table);
2521
    qemu_free(l2_table);
2522
    return -EIO;
2523
}
2524

    
2525
static void check_refcounts(BlockDriverState *bs)
2526
{
2527
    BDRVQcowState *s = bs->opaque;
2528
    int64_t size;
2529
    int nb_clusters, refcount1, refcount2, i;
2530
    QCowSnapshot *sn;
2531
    uint16_t *refcount_table;
2532

    
2533
    size = bdrv_getlength(s->hd);
2534
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
2535
    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
2536

    
2537
    /* header */
2538
    inc_refcounts(bs, refcount_table, nb_clusters,
2539
                  0, s->cluster_size);
2540

    
2541
    check_refcounts_l1(bs, refcount_table, nb_clusters,
2542
                       s->l1_table_offset, s->l1_size, 1);
2543

    
2544
    /* snapshots */
2545
    for(i = 0; i < s->nb_snapshots; i++) {
2546
        sn = s->snapshots + i;
2547
        check_refcounts_l1(bs, refcount_table, nb_clusters,
2548
                           sn->l1_table_offset, sn->l1_size, 0);
2549
    }
2550
    inc_refcounts(bs, refcount_table, nb_clusters,
2551
                  s->snapshots_offset, s->snapshots_size);
2552

    
2553
    /* refcount data */
2554
    inc_refcounts(bs, refcount_table, nb_clusters,
2555
                  s->refcount_table_offset,
2556
                  s->refcount_table_size * sizeof(uint64_t));
2557
    for(i = 0; i < s->refcount_table_size; i++) {
2558
        int64_t offset;
2559
        offset = s->refcount_table[i];
2560
        if (offset != 0) {
2561
            inc_refcounts(bs, refcount_table, nb_clusters,
2562
                          offset, s->cluster_size);
2563
        }
2564
    }
2565

    
2566
    /* compare ref counts */
2567
    for(i = 0; i < nb_clusters; i++) {
2568
        refcount1 = get_refcount(bs, i);
2569
        refcount2 = refcount_table[i];
2570
        if (refcount1 != refcount2)
2571
            printf("ERROR cluster %d refcount=%d reference=%d\n",
2572
                   i, refcount1, refcount2);
2573
    }
2574

    
2575
    qemu_free(refcount_table);
2576
}
2577

    
2578
#if 0
2579
static void dump_refcounts(BlockDriverState *bs)
2580
{
2581
    BDRVQcowState *s = bs->opaque;
2582
    int64_t nb_clusters, k, k1, size;
2583
    int refcount;
2584

2585
    size = bdrv_getlength(s->hd);
2586
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
2587
    for(k = 0; k < nb_clusters;) {
2588
        k1 = k;
2589
        refcount = get_refcount(bs, k);
2590
        k++;
2591
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
2592
            k++;
2593
        printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1);
2594
    }
2595
}
2596
#endif
2597
#endif
2598

    
2599
BlockDriver bdrv_qcow2 = {
2600
    "qcow2",
2601
    sizeof(BDRVQcowState),
2602
    qcow_probe,
2603
    qcow_open,
2604
    NULL,
2605
    NULL,
2606
    qcow_close,
2607
    qcow_create,
2608
    qcow_flush,
2609
    qcow_is_allocated,
2610
    qcow_set_key,
2611
    qcow_make_empty,
2612

    
2613
    .bdrv_aio_read = qcow_aio_read,
2614
    .bdrv_aio_write = qcow_aio_write,
2615
    .bdrv_aio_cancel = qcow_aio_cancel,
2616
    .aiocb_size = sizeof(QCowAIOCB),
2617
    .bdrv_write_compressed = qcow_write_compressed,
2618

    
2619
    .bdrv_snapshot_create = qcow_snapshot_create,
2620
    .bdrv_snapshot_goto = qcow_snapshot_goto,
2621
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2622
    .bdrv_snapshot_list = qcow_snapshot_list,
2623
    .bdrv_get_info = qcow_get_info,
2624
};