Statistics
| Branch: | Revision:

root / block-qcow2.c @ f4af02ed

History | View | Annotate | Download (79.8 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block_int.h"
26
#include <zlib.h>
27
#include "aes.h"
28
#include <assert.h>
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots.
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables have always a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
47
//#define DEBUG_ALLOC2
48

    
49
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
50
#define QCOW_VERSION 2
51

    
52
#define QCOW_CRYPT_NONE 0
53
#define QCOW_CRYPT_AES  1
54

    
55
#define QCOW_MAX_CRYPT_CLUSTERS 32
56

    
57
/* indicate that the refcount of the referenced cluster is exactly one. */
58
#define QCOW_OFLAG_COPIED     (1LL << 63)
59
/* indicate that the cluster is compressed (they never have the copied flag) */
60
#define QCOW_OFLAG_COMPRESSED (1LL << 62)
61

    
62
#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
63

    
64
typedef struct QCowHeader {
65
    uint32_t magic;
66
    uint32_t version;
67
    uint64_t backing_file_offset;
68
    uint32_t backing_file_size;
69
    uint32_t cluster_bits;
70
    uint64_t size; /* in bytes */
71
    uint32_t crypt_method;
72
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
73
    uint64_t l1_table_offset;
74
    uint64_t refcount_table_offset;
75
    uint32_t refcount_table_clusters;
76
    uint32_t nb_snapshots;
77
    uint64_t snapshots_offset;
78
} QCowHeader;
79

    
80
typedef struct __attribute__((packed)) QCowSnapshotHeader {
81
    /* header is 8 byte aligned */
82
    uint64_t l1_table_offset;
83

    
84
    uint32_t l1_size;
85
    uint16_t id_str_size;
86
    uint16_t name_size;
87

    
88
    uint32_t date_sec;
89
    uint32_t date_nsec;
90

    
91
    uint64_t vm_clock_nsec;
92

    
93
    uint32_t vm_state_size;
94
    uint32_t extra_data_size; /* for extension */
95
    /* extra data follows */
96
    /* id_str follows */
97
    /* name follows  */
98
} QCowSnapshotHeader;
99

    
100
#define L2_CACHE_SIZE 16
101

    
102
typedef struct QCowSnapshot {
103
    uint64_t l1_table_offset;
104
    uint32_t l1_size;
105
    char *id_str;
106
    char *name;
107
    uint32_t vm_state_size;
108
    uint32_t date_sec;
109
    uint32_t date_nsec;
110
    uint64_t vm_clock_nsec;
111
} QCowSnapshot;
112

    
113
typedef struct BDRVQcowState {
114
    BlockDriverState *hd;
115
    int cluster_bits;
116
    int cluster_size;
117
    int cluster_sectors;
118
    int l2_bits;
119
    int l2_size;
120
    int l1_size;
121
    int l1_vm_state_index;
122
    int csize_shift;
123
    int csize_mask;
124
    uint64_t cluster_offset_mask;
125
    uint64_t l1_table_offset;
126
    uint64_t *l1_table;
127
    uint64_t *l2_cache;
128
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
129
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
130
    uint8_t *cluster_cache;
131
    uint8_t *cluster_data;
132
    uint64_t cluster_cache_offset;
133

    
134
    uint64_t *refcount_table;
135
    uint64_t refcount_table_offset;
136
    uint32_t refcount_table_size;
137
    uint64_t refcount_block_cache_offset;
138
    uint16_t *refcount_block_cache;
139
    int64_t free_cluster_index;
140
    int64_t free_byte_offset;
141

    
142
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
143
    uint32_t crypt_method_header;
144
    AES_KEY aes_encrypt_key;
145
    AES_KEY aes_decrypt_key;
146
    uint64_t snapshots_offset;
147
    int snapshots_size;
148
    int nb_snapshots;
149
    QCowSnapshot *snapshots;
150
} BDRVQcowState;
151

    
152
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
153
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
154
                     uint8_t *buf, int nb_sectors);
155
static int qcow_read_snapshots(BlockDriverState *bs);
156
static void qcow_free_snapshots(BlockDriverState *bs);
157
static int refcount_init(BlockDriverState *bs);
158
static void refcount_close(BlockDriverState *bs);
159
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
160
static int update_cluster_refcount(BlockDriverState *bs,
161
                                   int64_t cluster_index,
162
                                   int addend);
163
static void update_refcount(BlockDriverState *bs,
164
                            int64_t offset, int64_t length,
165
                            int addend);
166
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
167
static int64_t alloc_bytes(BlockDriverState *bs, int size);
168
static void free_clusters(BlockDriverState *bs,
169
                          int64_t offset, int64_t size);
170
#ifdef DEBUG_ALLOC
171
static void check_refcounts(BlockDriverState *bs);
172
#endif
173

    
174
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
175
{
176
    const QCowHeader *cow_header = (const void *)buf;
177

    
178
    if (buf_size >= sizeof(QCowHeader) &&
179
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
180
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
181
        return 100;
182
    else
183
        return 0;
184
}
185

    
186
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
187
{
188
    BDRVQcowState *s = bs->opaque;
189
    int len, i, shift, ret;
190
    QCowHeader header;
191

    
192
    ret = bdrv_file_open(&s->hd, filename, flags);
193
    if (ret < 0)
194
        return ret;
195
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
196
        goto fail;
197
    be32_to_cpus(&header.magic);
198
    be32_to_cpus(&header.version);
199
    be64_to_cpus(&header.backing_file_offset);
200
    be32_to_cpus(&header.backing_file_size);
201
    be64_to_cpus(&header.size);
202
    be32_to_cpus(&header.cluster_bits);
203
    be32_to_cpus(&header.crypt_method);
204
    be64_to_cpus(&header.l1_table_offset);
205
    be32_to_cpus(&header.l1_size);
206
    be64_to_cpus(&header.refcount_table_offset);
207
    be32_to_cpus(&header.refcount_table_clusters);
208
    be64_to_cpus(&header.snapshots_offset);
209
    be32_to_cpus(&header.nb_snapshots);
210

    
211
    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
212
        goto fail;
213
    if (header.size <= 1 ||
214
        header.cluster_bits < 9 ||
215
        header.cluster_bits > 16)
216
        goto fail;
217
    if (header.crypt_method > QCOW_CRYPT_AES)
218
        goto fail;
219
    s->crypt_method_header = header.crypt_method;
220
    if (s->crypt_method_header)
221
        bs->encrypted = 1;
222
    s->cluster_bits = header.cluster_bits;
223
    s->cluster_size = 1 << s->cluster_bits;
224
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
225
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
226
    s->l2_size = 1 << s->l2_bits;
227
    bs->total_sectors = header.size / 512;
228
    s->csize_shift = (62 - (s->cluster_bits - 8));
229
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
230
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
231
    s->refcount_table_offset = header.refcount_table_offset;
232
    s->refcount_table_size =
233
        header.refcount_table_clusters << (s->cluster_bits - 3);
234

    
235
    s->snapshots_offset = header.snapshots_offset;
236
    s->nb_snapshots = header.nb_snapshots;
237

    
238
    /* read the level 1 table */
239
    s->l1_size = header.l1_size;
240
    shift = s->cluster_bits + s->l2_bits;
241
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
242
    /* the L1 table must contain at least enough entries to put
243
       header.size bytes */
244
    if (s->l1_size < s->l1_vm_state_index)
245
        goto fail;
246
    s->l1_table_offset = header.l1_table_offset;
247
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
248
    if (!s->l1_table)
249
        goto fail;
250
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
251
        s->l1_size * sizeof(uint64_t))
252
        goto fail;
253
    for(i = 0;i < s->l1_size; i++) {
254
        be64_to_cpus(&s->l1_table[i]);
255
    }
256
    /* alloc L2 cache */
257
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
258
    if (!s->l2_cache)
259
        goto fail;
260
    s->cluster_cache = qemu_malloc(s->cluster_size);
261
    if (!s->cluster_cache)
262
        goto fail;
263
    /* one more sector for decompressed data alignment */
264
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
265
                                  + 512);
266
    if (!s->cluster_data)
267
        goto fail;
268
    s->cluster_cache_offset = -1;
269

    
270
    if (refcount_init(bs) < 0)
271
        goto fail;
272

    
273
    /* read the backing file name */
274
    if (header.backing_file_offset != 0) {
275
        len = header.backing_file_size;
276
        if (len > 1023)
277
            len = 1023;
278
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
279
            goto fail;
280
        bs->backing_file[len] = '\0';
281
    }
282
    if (qcow_read_snapshots(bs) < 0)
283
        goto fail;
284

    
285
#ifdef DEBUG_ALLOC
286
    check_refcounts(bs);
287
#endif
288
    return 0;
289

    
290
 fail:
291
    qcow_free_snapshots(bs);
292
    refcount_close(bs);
293
    qemu_free(s->l1_table);
294
    qemu_free(s->l2_cache);
295
    qemu_free(s->cluster_cache);
296
    qemu_free(s->cluster_data);
297
    bdrv_delete(s->hd);
298
    return -1;
299
}
300

    
301
static int qcow_set_key(BlockDriverState *bs, const char *key)
302
{
303
    BDRVQcowState *s = bs->opaque;
304
    uint8_t keybuf[16];
305
    int len, i;
306

    
307
    memset(keybuf, 0, 16);
308
    len = strlen(key);
309
    if (len > 16)
310
        len = 16;
311
    /* XXX: we could compress the chars to 7 bits to increase
312
       entropy */
313
    for(i = 0;i < len;i++) {
314
        keybuf[i] = key[i];
315
    }
316
    s->crypt_method = s->crypt_method_header;
317

    
318
    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
319
        return -1;
320
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
321
        return -1;
322
#if 0
323
    /* test */
324
    {
325
        uint8_t in[16];
326
        uint8_t out[16];
327
        uint8_t tmp[16];
328
        for(i=0;i<16;i++)
329
            in[i] = i;
330
        AES_encrypt(in, tmp, &s->aes_encrypt_key);
331
        AES_decrypt(tmp, out, &s->aes_decrypt_key);
332
        for(i = 0; i < 16; i++)
333
            printf(" %02x", tmp[i]);
334
        printf("\n");
335
        for(i = 0; i < 16; i++)
336
            printf(" %02x", out[i]);
337
        printf("\n");
338
    }
339
#endif
340
    return 0;
341
}
342

    
343
/* The crypt function is compatible with the linux cryptoloop
344
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
345
   supported */
346
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
347
                            uint8_t *out_buf, const uint8_t *in_buf,
348
                            int nb_sectors, int enc,
349
                            const AES_KEY *key)
350
{
351
    union {
352
        uint64_t ll[2];
353
        uint8_t b[16];
354
    } ivec;
355
    int i;
356

    
357
    for(i = 0; i < nb_sectors; i++) {
358
        ivec.ll[0] = cpu_to_le64(sector_num);
359
        ivec.ll[1] = 0;
360
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
361
                        ivec.b, enc);
362
        sector_num++;
363
        in_buf += 512;
364
        out_buf += 512;
365
    }
366
}
367

    
368
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
369
                        uint64_t cluster_offset, int n_start, int n_end)
370
{
371
    BDRVQcowState *s = bs->opaque;
372
    int n, ret;
373

    
374
    n = n_end - n_start;
375
    if (n <= 0)
376
        return 0;
377
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
378
    if (ret < 0)
379
        return ret;
380
    if (s->crypt_method) {
381
        encrypt_sectors(s, start_sect + n_start,
382
                        s->cluster_data,
383
                        s->cluster_data, n, 1,
384
                        &s->aes_encrypt_key);
385
    }
386
    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
387
                     s->cluster_data, n);
388
    if (ret < 0)
389
        return ret;
390
    return 0;
391
}
392

    
393
static void l2_cache_reset(BlockDriverState *bs)
394
{
395
    BDRVQcowState *s = bs->opaque;
396

    
397
    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
398
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
399
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
400
}
401

    
402
static inline int l2_cache_new_entry(BlockDriverState *bs)
403
{
404
    BDRVQcowState *s = bs->opaque;
405
    uint32_t min_count;
406
    int min_index, i;
407

    
408
    /* find a new entry in the least used one */
409
    min_index = 0;
410
    min_count = 0xffffffff;
411
    for(i = 0; i < L2_CACHE_SIZE; i++) {
412
        if (s->l2_cache_counts[i] < min_count) {
413
            min_count = s->l2_cache_counts[i];
414
            min_index = i;
415
        }
416
    }
417
    return min_index;
418
}
419

    
420
static int64_t align_offset(int64_t offset, int n)
421
{
422
    offset = (offset + n - 1) & ~(n - 1);
423
    return offset;
424
}
425

    
426
static int grow_l1_table(BlockDriverState *bs, int min_size)
427
{
428
    BDRVQcowState *s = bs->opaque;
429
    int new_l1_size, new_l1_size2, ret, i;
430
    uint64_t *new_l1_table;
431
    uint64_t new_l1_table_offset;
432
    uint64_t data64;
433
    uint32_t data32;
434

    
435
    new_l1_size = s->l1_size;
436
    if (min_size <= new_l1_size)
437
        return 0;
438
    while (min_size > new_l1_size) {
439
        new_l1_size = (new_l1_size * 3 + 1) / 2;
440
    }
441
#ifdef DEBUG_ALLOC2
442
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
443
#endif
444

    
445
    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
446
    new_l1_table = qemu_mallocz(new_l1_size2);
447
    if (!new_l1_table)
448
        return -ENOMEM;
449
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
450

    
451
    /* write new table (align to cluster) */
452
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
453

    
454
    for(i = 0; i < s->l1_size; i++)
455
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
456
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
457
    if (ret != new_l1_size2)
458
        goto fail;
459
    for(i = 0; i < s->l1_size; i++)
460
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
461

    
462
    /* set new table */
463
    data64 = cpu_to_be64(new_l1_table_offset);
464
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_table_offset),
465
                    &data64, sizeof(data64)) != sizeof(data64))
466
        goto fail;
467
    data32 = cpu_to_be32(new_l1_size);
468
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size),
469
                    &data32, sizeof(data32)) != sizeof(data32))
470
        goto fail;
471
    qemu_free(s->l1_table);
472
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
473
    s->l1_table_offset = new_l1_table_offset;
474
    s->l1_table = new_l1_table;
475
    s->l1_size = new_l1_size;
476
    return 0;
477
 fail:
478
    qemu_free(s->l1_table);
479
    return -EIO;
480
}
481

    
482
/*
483
 * seek_l2_table
484
 *
485
 * seek l2_offset in the l2_cache table
486
 * if not found, return NULL,
487
 * if found,
488
 *   increments the l2 cache hit count of the entry,
489
 *   if counter overflow, divide by two all counters
490
 *   return the pointer to the l2 cache entry
491
 *
492
 */
493

    
494
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
495
{
496
    int i, j;
497

    
498
    for(i = 0; i < L2_CACHE_SIZE; i++) {
499
        if (l2_offset == s->l2_cache_offsets[i]) {
500
            /* increment the hit count */
501
            if (++s->l2_cache_counts[i] == 0xffffffff) {
502
                for(j = 0; j < L2_CACHE_SIZE; j++) {
503
                    s->l2_cache_counts[j] >>= 1;
504
                }
505
            }
506
            return s->l2_cache + (i << s->l2_bits);
507
        }
508
    }
509
    return NULL;
510
}
511

    
512
/*
513
 * l2_load
514
 *
515
 * Loads a L2 table into memory. If the table is in the cache, the cache
516
 * is used; otherwise the L2 table is loaded from the image file.
517
 *
518
 * Returns a pointer to the L2 table on success, or NULL if the read from
519
 * the image file failed.
520
 */
521

    
522
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
523
{
524
    BDRVQcowState *s = bs->opaque;
525
    int min_index;
526
    uint64_t *l2_table;
527

    
528
    /* seek if the table for the given offset is in the cache */
529

    
530
    l2_table = seek_l2_table(s, l2_offset);
531
    if (l2_table != NULL)
532
        return l2_table;
533

    
534
    /* not found: load a new entry in the least used one */
535

    
536
    min_index = l2_cache_new_entry(bs);
537
    l2_table = s->l2_cache + (min_index << s->l2_bits);
538
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
539
        s->l2_size * sizeof(uint64_t))
540
        return NULL;
541
    s->l2_cache_offsets[min_index] = l2_offset;
542
    s->l2_cache_counts[min_index] = 1;
543

    
544
    return l2_table;
545
}
546

    
547
/*
548
 * l2_allocate
549
 *
550
 * Allocate a new l2 entry in the file. If l1_index points to an already
551
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
552
 * table) copy the contents of the old L2 table into the newly allocated one.
553
 * Otherwise the new table is initialized with zeros.
554
 *
555
 */
556

    
557
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
558
{
559
    BDRVQcowState *s = bs->opaque;
560
    int min_index;
561
    uint64_t old_l2_offset, tmp;
562
    uint64_t *l2_table, l2_offset;
563

    
564
    old_l2_offset = s->l1_table[l1_index];
565

    
566
    /* allocate a new l2 entry */
567

    
568
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
569

    
570
    /* update the L1 entry */
571

    
572
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
573

    
574
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
575
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
576
                    &tmp, sizeof(tmp)) != sizeof(tmp))
577
        return NULL;
578

    
579
    /* allocate a new entry in the l2 cache */
580

    
581
    min_index = l2_cache_new_entry(bs);
582
    l2_table = s->l2_cache + (min_index << s->l2_bits);
583

    
584
    if (old_l2_offset == 0) {
585
        /* if there was no old l2 table, clear the new table */
586
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
587
    } else {
588
        /* if there was an old l2 table, read it from the disk */
589
        if (bdrv_pread(s->hd, old_l2_offset,
590
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
591
            s->l2_size * sizeof(uint64_t))
592
            return NULL;
593
    }
594
    /* write the l2 table to the file */
595
    if (bdrv_pwrite(s->hd, l2_offset,
596
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
597
        s->l2_size * sizeof(uint64_t))
598
        return NULL;
599

    
600
    /* update the l2 cache entry */
601

    
602
    s->l2_cache_offsets[min_index] = l2_offset;
603
    s->l2_cache_counts[min_index] = 1;
604

    
605
    return l2_table;
606
}
607

    
608
/*
609
 * get_cluster_offset
610
 *
611
 * For a given offset of the disk image, return cluster offset in
612
 * qcow2 file.
613
 *
614
 * on entry, *num is the number of contiguous clusters we'd like to
615
 * access following offset.
616
 *
617
 * on exit, *num is the number of contiguous clusters we can read.
618
 *
619
 * Return 1, if the offset is found
620
 * Return 0, otherwise.
621
 *
622
 */
623

    
624
static uint64_t get_cluster_offset(BlockDriverState *bs,
625
                                   uint64_t offset, int *num)
626
{
627
    BDRVQcowState *s = bs->opaque;
628
    int l1_index, l2_index;
629
    uint64_t l2_offset, *l2_table, cluster_offset, next;
630
    int l1_bits;
631
    int index_in_cluster, nb_available, nb_needed;
632

    
633
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
634
    nb_needed = *num + index_in_cluster;
635

    
636
    l1_bits = s->l2_bits + s->cluster_bits;
637

    
638
    /* compute how many bytes there are between the offset and
639
     * and the end of the l1 entry
640
     */
641

    
642
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
643

    
644
    /* compute the number of available sectors */
645

    
646
    nb_available = (nb_available >> 9) + index_in_cluster;
647

    
648
    cluster_offset = 0;
649

    
650
    /* seek the the l2 offset in the l1 table */
651

    
652
    l1_index = offset >> l1_bits;
653
    if (l1_index >= s->l1_size)
654
        goto out;
655

    
656
    l2_offset = s->l1_table[l1_index];
657

    
658
    /* seek the l2 table of the given l2 offset */
659

    
660
    if (!l2_offset)
661
        goto out;
662

    
663
    /* load the l2 table in memory */
664

    
665
    l2_offset &= ~QCOW_OFLAG_COPIED;
666
    l2_table = l2_load(bs, l2_offset);
667
    if (l2_table == NULL)
668
        return 0;
669

    
670
    /* find the cluster offset for the given disk offset */
671

    
672
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
673
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
674
    nb_available = s->cluster_sectors;
675
    l2_index++;
676

    
677
    if (!cluster_offset) {
678

    
679
       /* how many empty clusters ? */
680

    
681
       while (nb_available < nb_needed && !l2_table[l2_index]) {
682
           l2_index++;
683
           nb_available += s->cluster_sectors;
684
       }
685
    } else {
686

    
687
       /* how many allocated clusters ? */
688

    
689
       cluster_offset &= ~QCOW_OFLAG_COPIED;
690
       while (nb_available < nb_needed) {
691
           next = be64_to_cpu(l2_table[l2_index]) & ~QCOW_OFLAG_COPIED;
692
           if (next != cluster_offset + (nb_available << 9))
693
               break;
694
           l2_index++;
695
           nb_available += s->cluster_sectors;
696
       }
697
   }
698

    
699
out:
700
    if (nb_available > nb_needed)
701
        nb_available = nb_needed;
702

    
703
    *num = nb_available - index_in_cluster;
704

    
705
    return cluster_offset;
706
}
707

    
708
/*
709
 * free_any_clusters
710
 *
711
 * free clusters according to its type: compressed or not
712
 *
713
 */
714

    
715
static void free_any_clusters(BlockDriverState *bs,
716
                              uint64_t cluster_offset, int nb_clusters)
717
{
718
    BDRVQcowState *s = bs->opaque;
719

    
720
    /* free the cluster */
721

    
722
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
723
        int nb_csectors;
724
        nb_csectors = ((cluster_offset >> s->csize_shift) &
725
                       s->csize_mask) + 1;
726
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
727
                      nb_csectors * 512);
728
        return;
729
    }
730

    
731
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
732

    
733
    return;
734
}
735

    
736
/*
737
 * get_cluster_table
738
 *
739
 * for a given disk offset, load (and allocate if needed)
740
 * the l2 table.
741
 *
742
 * the l2 table offset in the qcow2 file and the cluster index
743
 * in the l2 table are given to the caller.
744
 *
745
 */
746

    
747
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
748
                             uint64_t **new_l2_table,
749
                             uint64_t *new_l2_offset,
750
                             int *new_l2_index)
751
{
752
    BDRVQcowState *s = bs->opaque;
753
    int l1_index, l2_index, ret;
754
    uint64_t l2_offset, *l2_table;
755

    
756
    /* seek the the l2 offset in the l1 table */
757

    
758
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
759
    if (l1_index >= s->l1_size) {
760
        ret = grow_l1_table(bs, l1_index + 1);
761
        if (ret < 0)
762
            return 0;
763
    }
764
    l2_offset = s->l1_table[l1_index];
765

    
766
    /* seek the l2 table of the given l2 offset */
767

    
768
    if (l2_offset & QCOW_OFLAG_COPIED) {
769
        /* load the l2 table in memory */
770
        l2_offset &= ~QCOW_OFLAG_COPIED;
771
        l2_table = l2_load(bs, l2_offset);
772
        if (l2_table == NULL)
773
            return 0;
774
    } else {
775
        if (l2_offset)
776
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
777
        l2_table = l2_allocate(bs, l1_index);
778
        if (l2_table == NULL)
779
            return 0;
780
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
781
    }
782

    
783
    /* find the cluster offset for the given disk offset */
784

    
785
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
786

    
787
    *new_l2_table = l2_table;
788
    *new_l2_offset = l2_offset;
789
    *new_l2_index = l2_index;
790

    
791
    return 1;
792
}
793

    
794
/*
795
 * alloc_compressed_cluster_offset
796
 *
797
 * For a given offset of the disk image, return cluster offset in
798
 * qcow2 file.
799
 *
800
 * If the offset is not found, allocate a new compressed cluster.
801
 *
802
 * Return the cluster offset if successful,
803
 * Return 0, otherwise.
804
 *
805
 */
806

    
807
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
808
                                                uint64_t offset,
809
                                                int compressed_size)
810
{
811
    BDRVQcowState *s = bs->opaque;
812
    int l2_index, ret;
813
    uint64_t l2_offset, *l2_table, cluster_offset;
814
    int nb_csectors;
815

    
816
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
817
    if (ret == 0)
818
        return 0;
819

    
820
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
821
    if (cluster_offset & QCOW_OFLAG_COPIED)
822
        return cluster_offset & ~QCOW_OFLAG_COPIED;
823

    
824
    if (cluster_offset)
825
        free_any_clusters(bs, cluster_offset, 1);
826

    
827
    cluster_offset = alloc_bytes(bs, compressed_size);
828
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
829
                  (cluster_offset >> 9);
830

    
831
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
832
                      ((uint64_t)nb_csectors << s->csize_shift);
833

    
834
    /* update L2 table */
835

    
836
    /* compressed clusters never have the copied flag */
837

    
838
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
839
    if (bdrv_pwrite(s->hd,
840
                    l2_offset + l2_index * sizeof(uint64_t),
841
                    l2_table + l2_index,
842
                    sizeof(uint64_t)) != sizeof(uint64_t))
843
        return 0;
844

    
845
    return cluster_offset;
846
}
847

    
848
/*
849
 * alloc_cluster_offset
850
 *
851
 * For a given offset of the disk image, return cluster offset in
852
 * qcow2 file.
853
 *
854
 * If the offset is not found, allocate a new cluster.
855
 *
856
 * Return the cluster offset if successful,
857
 * Return 0, otherwise.
858
 *
859
 */
860

    
861
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
862
                                     uint64_t offset,
863
                                     int n_start, int n_end,
864
                                     int *num)
865
{
866
    BDRVQcowState *s = bs->opaque;
867
    int l2_index, ret;
868
    uint64_t l2_offset, *l2_table, cluster_offset;
869
    int nb_available, nb_clusters, i, j;
870
    uint64_t start_sect, current;
871

    
872
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
873
    if (ret == 0)
874
        return 0;
875

    
876
    nb_clusters = ((n_end << 9) + s->cluster_size - 1) >>
877
                  s->cluster_bits;
878
    if (nb_clusters > s->l2_size - l2_index)
879
            nb_clusters = s->l2_size - l2_index;
880

    
881
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
882

    
883
    /* We keep all QCOW_OFLAG_COPIED clusters */
884

    
885
    if (cluster_offset & QCOW_OFLAG_COPIED) {
886

    
887
        for (i = 1; i < nb_clusters; i++) {
888
            current = be64_to_cpu(l2_table[l2_index + i]);
889
            if (cluster_offset + (i << s->cluster_bits) != current)
890
                break;
891
        }
892
        nb_clusters = i;
893

    
894
        nb_available = nb_clusters << (s->cluster_bits - 9);
895
        if (nb_available > n_end)
896
            nb_available = n_end;
897

    
898
        cluster_offset &= ~QCOW_OFLAG_COPIED;
899

    
900
        goto out;
901
    }
902

    
903
    /* for the moment, multiple compressed clusters are not managed */
904

    
905
    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
906
        nb_clusters = 1;
907

    
908
    /* how many available clusters ? */
909

    
910
    i = 0;
911
    while (i < nb_clusters) {
912

    
913
        i++;
914

    
915
        if (!cluster_offset) {
916

    
917
            /* how many free clusters ? */
918

    
919
            while (i < nb_clusters) {
920
                cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
921
                if (cluster_offset != 0)
922
                    break;
923
                i++;
924
            }
925

    
926
            if ((cluster_offset & QCOW_OFLAG_COPIED) ||
927
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
928
                break;
929

    
930
        } else {
931

    
932
            /* how many contiguous clusters ? */
933

    
934
            j = 1;
935
            current = 0;
936
            while (i < nb_clusters) {
937
                current = be64_to_cpu(l2_table[l2_index + i]);
938
                if (cluster_offset + (j << s->cluster_bits) != current)
939
                    break;
940

    
941
                i++;
942
                j++;
943
            }
944

    
945
            free_any_clusters(bs, cluster_offset, j);
946
            if (current)
947
                break;
948
            cluster_offset = current;
949
        }
950
    }
951
    nb_clusters = i;
952

    
953
    /* allocate a new cluster */
954

    
955
    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
956

    
957
    /* we must initialize the cluster content which won't be
958
       written */
959

    
960
    nb_available = nb_clusters << (s->cluster_bits - 9);
961
    if (nb_available > n_end)
962
        nb_available = n_end;
963

    
964
    /* copy content of unmodified sectors */
965

    
966
    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
967
    if (n_start) {
968
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, n_start);
969
        if (ret < 0)
970
            return 0;
971
    }
972

    
973
    if (nb_available & (s->cluster_sectors - 1)) {
974
        uint64_t end = nb_available & ~(uint64_t)(s->cluster_sectors - 1);
975
        ret = copy_sectors(bs, start_sect + end,
976
                           cluster_offset + (end << 9),
977
                           nb_available - end,
978
                           s->cluster_sectors);
979
        if (ret < 0)
980
            return 0;
981
    }
982

    
983
    /* update L2 table */
984

    
985
    for (i = 0; i < nb_clusters; i++)
986
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
987
                                             (i << s->cluster_bits)) |
988
                                             QCOW_OFLAG_COPIED);
989

    
990
    if (bdrv_pwrite(s->hd,
991
                    l2_offset + l2_index * sizeof(uint64_t),
992
                    l2_table + l2_index,
993
                    nb_clusters * sizeof(uint64_t)) !=
994
                    nb_clusters * sizeof(uint64_t))
995
        return 0;
996

    
997
out:
998
    *num = nb_available - n_start;
999

    
1000
    return cluster_offset;
1001
}
1002

    
1003
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
1004
                             int nb_sectors, int *pnum)
1005
{
1006
    uint64_t cluster_offset;
1007

    
1008
    *pnum = nb_sectors;
1009
    cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum);
1010

    
1011
    return (cluster_offset != 0);
1012
}
1013

    
1014
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1015
                             const uint8_t *buf, int buf_size)
1016
{
1017
    z_stream strm1, *strm = &strm1;
1018
    int ret, out_len;
1019

    
1020
    memset(strm, 0, sizeof(*strm));
1021

    
1022
    strm->next_in = (uint8_t *)buf;
1023
    strm->avail_in = buf_size;
1024
    strm->next_out = out_buf;
1025
    strm->avail_out = out_buf_size;
1026

    
1027
    ret = inflateInit2(strm, -12);
1028
    if (ret != Z_OK)
1029
        return -1;
1030
    ret = inflate(strm, Z_FINISH);
1031
    out_len = strm->next_out - out_buf;
1032
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1033
        out_len != out_buf_size) {
1034
        inflateEnd(strm);
1035
        return -1;
1036
    }
1037
    inflateEnd(strm);
1038
    return 0;
1039
}
1040

    
1041
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
1042
{
1043
    int ret, csize, nb_csectors, sector_offset;
1044
    uint64_t coffset;
1045

    
1046
    coffset = cluster_offset & s->cluster_offset_mask;
1047
    if (s->cluster_cache_offset != coffset) {
1048
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1049
        sector_offset = coffset & 511;
1050
        csize = nb_csectors * 512 - sector_offset;
1051
        ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
1052
        if (ret < 0) {
1053
            return -1;
1054
        }
1055
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
1056
                              s->cluster_data + sector_offset, csize) < 0) {
1057
            return -1;
1058
        }
1059
        s->cluster_cache_offset = coffset;
1060
    }
1061
    return 0;
1062
}
1063

    
1064
/* handle reading after the end of the backing file */
1065
static int backing_read1(BlockDriverState *bs,
1066
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
1067
{
1068
    int n1;
1069
    if ((sector_num + nb_sectors) <= bs->total_sectors)
1070
        return nb_sectors;
1071
    if (sector_num >= bs->total_sectors)
1072
        n1 = 0;
1073
    else
1074
        n1 = bs->total_sectors - sector_num;
1075
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
1076
    return n1;
1077
}
1078

    
1079
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
1080
                     uint8_t *buf, int nb_sectors)
1081
{
1082
    BDRVQcowState *s = bs->opaque;
1083
    int ret, index_in_cluster, n, n1;
1084
    uint64_t cluster_offset;
1085

    
1086
    while (nb_sectors > 0) {
1087
        n = nb_sectors;
1088
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
1089
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1090
        if (!cluster_offset) {
1091
            if (bs->backing_hd) {
1092
                /* read from the base image */
1093
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
1094
                if (n1 > 0) {
1095
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
1096
                    if (ret < 0)
1097
                        return -1;
1098
                }
1099
            } else {
1100
                memset(buf, 0, 512 * n);
1101
            }
1102
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
1103
            if (decompress_cluster(s, cluster_offset) < 0)
1104
                return -1;
1105
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
1106
        } else {
1107
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1108
            if (ret != n * 512)
1109
                return -1;
1110
            if (s->crypt_method) {
1111
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
1112
                                &s->aes_decrypt_key);
1113
            }
1114
        }
1115
        nb_sectors -= n;
1116
        sector_num += n;
1117
        buf += n * 512;
1118
    }
1119
    return 0;
1120
}
1121

    
1122
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1123
                     const uint8_t *buf, int nb_sectors)
1124
{
1125
    BDRVQcowState *s = bs->opaque;
1126
    int ret, index_in_cluster, n;
1127
    uint64_t cluster_offset;
1128
    int n_end;
1129

    
1130
    while (nb_sectors > 0) {
1131
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1132
        n_end = index_in_cluster + nb_sectors;
1133
        if (s->crypt_method &&
1134
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1135
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1136
        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
1137
                                              index_in_cluster,
1138
                                              n_end, &n);
1139
        if (!cluster_offset)
1140
            return -1;
1141
        if (s->crypt_method) {
1142
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
1143
                            &s->aes_encrypt_key);
1144
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
1145
                              s->cluster_data, n * 512);
1146
        } else {
1147
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1148
        }
1149
        if (ret != n * 512)
1150
            return -1;
1151
        nb_sectors -= n;
1152
        sector_num += n;
1153
        buf += n * 512;
1154
    }
1155
    s->cluster_cache_offset = -1; /* disable compressed cache */
1156
    return 0;
1157
}
1158

    
1159
typedef struct QCowAIOCB {
1160
    BlockDriverAIOCB common;
1161
    int64_t sector_num;
1162
    uint8_t *buf;
1163
    int nb_sectors;
1164
    int n;
1165
    uint64_t cluster_offset;
1166
    uint8_t *cluster_data;
1167
    BlockDriverAIOCB *hd_aiocb;
1168
} QCowAIOCB;
1169

    
1170
static void qcow_aio_read_cb(void *opaque, int ret)
1171
{
1172
    QCowAIOCB *acb = opaque;
1173
    BlockDriverState *bs = acb->common.bs;
1174
    BDRVQcowState *s = bs->opaque;
1175
    int index_in_cluster, n1;
1176

    
1177
    acb->hd_aiocb = NULL;
1178
    if (ret < 0) {
1179
    fail:
1180
        acb->common.cb(acb->common.opaque, ret);
1181
        qemu_aio_release(acb);
1182
        return;
1183
    }
1184

    
1185
 redo:
1186
    /* post process the read buffer */
1187
    if (!acb->cluster_offset) {
1188
        /* nothing to do */
1189
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1190
        /* nothing to do */
1191
    } else {
1192
        if (s->crypt_method) {
1193
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
1194
                            acb->n, 0,
1195
                            &s->aes_decrypt_key);
1196
        }
1197
    }
1198

    
1199
    acb->nb_sectors -= acb->n;
1200
    acb->sector_num += acb->n;
1201
    acb->buf += acb->n * 512;
1202

    
1203
    if (acb->nb_sectors == 0) {
1204
        /* request completed */
1205
        acb->common.cb(acb->common.opaque, 0);
1206
        qemu_aio_release(acb);
1207
        return;
1208
    }
1209

    
1210
    /* prepare next AIO request */
1211
    acb->n = acb->nb_sectors;
1212
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
1213
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1214

    
1215
    if (!acb->cluster_offset) {
1216
        if (bs->backing_hd) {
1217
            /* read from the base image */
1218
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
1219
                               acb->buf, acb->n);
1220
            if (n1 > 0) {
1221
                acb->hd_aiocb = bdrv_aio_read(bs->backing_hd, acb->sector_num,
1222
                                    acb->buf, acb->n, qcow_aio_read_cb, acb);
1223
                if (acb->hd_aiocb == NULL)
1224
                    goto fail;
1225
            } else {
1226
                goto redo;
1227
            }
1228
        } else {
1229
            /* Note: in this case, no need to wait */
1230
            memset(acb->buf, 0, 512 * acb->n);
1231
            goto redo;
1232
        }
1233
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1234
        /* add AIO support for compressed blocks ? */
1235
        if (decompress_cluster(s, acb->cluster_offset) < 0)
1236
            goto fail;
1237
        memcpy(acb->buf,
1238
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
1239
        goto redo;
1240
    } else {
1241
        if ((acb->cluster_offset & 511) != 0) {
1242
            ret = -EIO;
1243
            goto fail;
1244
        }
1245
        acb->hd_aiocb = bdrv_aio_read(s->hd,
1246
                            (acb->cluster_offset >> 9) + index_in_cluster,
1247
                            acb->buf, acb->n, qcow_aio_read_cb, acb);
1248
        if (acb->hd_aiocb == NULL)
1249
            goto fail;
1250
    }
1251
}
1252

    
1253
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1254
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1255
        BlockDriverCompletionFunc *cb, void *opaque)
1256
{
1257
    QCowAIOCB *acb;
1258

    
1259
    acb = qemu_aio_get(bs, cb, opaque);
1260
    if (!acb)
1261
        return NULL;
1262
    acb->hd_aiocb = NULL;
1263
    acb->sector_num = sector_num;
1264
    acb->buf = buf;
1265
    acb->nb_sectors = nb_sectors;
1266
    acb->n = 0;
1267
    acb->cluster_offset = 0;
1268
    return acb;
1269
}
1270

    
1271
static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
1272
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1273
        BlockDriverCompletionFunc *cb, void *opaque)
1274
{
1275
    QCowAIOCB *acb;
1276

    
1277
    acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
1278
    if (!acb)
1279
        return NULL;
1280

    
1281
    qcow_aio_read_cb(acb, 0);
1282
    return &acb->common;
1283
}
1284

    
1285
static void qcow_aio_write_cb(void *opaque, int ret)
1286
{
1287
    QCowAIOCB *acb = opaque;
1288
    BlockDriverState *bs = acb->common.bs;
1289
    BDRVQcowState *s = bs->opaque;
1290
    int index_in_cluster;
1291
    uint64_t cluster_offset;
1292
    const uint8_t *src_buf;
1293
    int n_end;
1294

    
1295
    acb->hd_aiocb = NULL;
1296

    
1297
    if (ret < 0) {
1298
    fail:
1299
        acb->common.cb(acb->common.opaque, ret);
1300
        qemu_aio_release(acb);
1301
        return;
1302
    }
1303

    
1304
    acb->nb_sectors -= acb->n;
1305
    acb->sector_num += acb->n;
1306
    acb->buf += acb->n * 512;
1307

    
1308
    if (acb->nb_sectors == 0) {
1309
        /* request completed */
1310
        acb->common.cb(acb->common.opaque, 0);
1311
        qemu_aio_release(acb);
1312
        return;
1313
    }
1314

    
1315
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1316
    n_end = index_in_cluster + acb->nb_sectors;
1317
    if (s->crypt_method &&
1318
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1319
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1320

    
1321
    cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1322
                                          index_in_cluster,
1323
                                          n_end, &acb->n);
1324
    if (!cluster_offset || (cluster_offset & 511) != 0) {
1325
        ret = -EIO;
1326
        goto fail;
1327
    }
1328
    if (s->crypt_method) {
1329
        if (!acb->cluster_data) {
1330
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
1331
                                             s->cluster_size);
1332
            if (!acb->cluster_data) {
1333
                ret = -ENOMEM;
1334
                goto fail;
1335
            }
1336
        }
1337
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
1338
                        acb->n, 1, &s->aes_encrypt_key);
1339
        src_buf = acb->cluster_data;
1340
    } else {
1341
        src_buf = acb->buf;
1342
    }
1343
    acb->hd_aiocb = bdrv_aio_write(s->hd,
1344
                                   (cluster_offset >> 9) + index_in_cluster,
1345
                                   src_buf, acb->n,
1346
                                   qcow_aio_write_cb, acb);
1347
    if (acb->hd_aiocb == NULL)
1348
        goto fail;
1349
}
1350

    
1351
static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
1352
        int64_t sector_num, const uint8_t *buf, int nb_sectors,
1353
        BlockDriverCompletionFunc *cb, void *opaque)
1354
{
1355
    BDRVQcowState *s = bs->opaque;
1356
    QCowAIOCB *acb;
1357

    
1358
    s->cluster_cache_offset = -1; /* disable compressed cache */
1359

    
1360
    acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
1361
    if (!acb)
1362
        return NULL;
1363

    
1364
    qcow_aio_write_cb(acb, 0);
1365
    return &acb->common;
1366
}
1367

    
1368
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
1369
{
1370
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;
1371
    if (acb->hd_aiocb)
1372
        bdrv_aio_cancel(acb->hd_aiocb);
1373
    qemu_aio_release(acb);
1374
}
1375

    
1376
static void qcow_close(BlockDriverState *bs)
1377
{
1378
    BDRVQcowState *s = bs->opaque;
1379
    qemu_free(s->l1_table);
1380
    qemu_free(s->l2_cache);
1381
    qemu_free(s->cluster_cache);
1382
    qemu_free(s->cluster_data);
1383
    refcount_close(bs);
1384
    bdrv_delete(s->hd);
1385
}
1386

    
1387
/* XXX: use std qcow open function ? */
1388
typedef struct QCowCreateState {
1389
    int cluster_size;
1390
    int cluster_bits;
1391
    uint16_t *refcount_block;
1392
    uint64_t *refcount_table;
1393
    int64_t l1_table_offset;
1394
    int64_t refcount_table_offset;
1395
    int64_t refcount_block_offset;
1396
} QCowCreateState;
1397

    
1398
static void create_refcount_update(QCowCreateState *s,
1399
                                   int64_t offset, int64_t size)
1400
{
1401
    int refcount;
1402
    int64_t start, last, cluster_offset;
1403
    uint16_t *p;
1404

    
1405
    start = offset & ~(s->cluster_size - 1);
1406
    last = (offset + size - 1)  & ~(s->cluster_size - 1);
1407
    for(cluster_offset = start; cluster_offset <= last;
1408
        cluster_offset += s->cluster_size) {
1409
        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
1410
        refcount = be16_to_cpu(*p);
1411
        refcount++;
1412
        *p = cpu_to_be16(refcount);
1413
    }
1414
}
1415

    
1416
static int qcow_create(const char *filename, int64_t total_size,
1417
                      const char *backing_file, int flags)
1418
{
1419
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
1420
    QCowHeader header;
1421
    uint64_t tmp, offset;
1422
    QCowCreateState s1, *s = &s1;
1423

    
1424
    memset(s, 0, sizeof(*s));
1425

    
1426
    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
1427
    if (fd < 0)
1428
        return -1;
1429
    memset(&header, 0, sizeof(header));
1430
    header.magic = cpu_to_be32(QCOW_MAGIC);
1431
    header.version = cpu_to_be32(QCOW_VERSION);
1432
    header.size = cpu_to_be64(total_size * 512);
1433
    header_size = sizeof(header);
1434
    backing_filename_len = 0;
1435
    if (backing_file) {
1436
        header.backing_file_offset = cpu_to_be64(header_size);
1437
        backing_filename_len = strlen(backing_file);
1438
        header.backing_file_size = cpu_to_be32(backing_filename_len);
1439
        header_size += backing_filename_len;
1440
    }
1441
    s->cluster_bits = 12;  /* 4 KB clusters */
1442
    s->cluster_size = 1 << s->cluster_bits;
1443
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
1444
    header_size = (header_size + 7) & ~7;
1445
    if (flags & BLOCK_FLAG_ENCRYPT) {
1446
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1447
    } else {
1448
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1449
    }
1450
    l2_bits = s->cluster_bits - 3;
1451
    shift = s->cluster_bits + l2_bits;
1452
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
1453
    offset = align_offset(header_size, s->cluster_size);
1454
    s->l1_table_offset = offset;
1455
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
1456
    header.l1_size = cpu_to_be32(l1_size);
1457
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
1458

    
1459
    s->refcount_table = qemu_mallocz(s->cluster_size);
1460
    if (!s->refcount_table)
1461
        goto fail;
1462
    s->refcount_block = qemu_mallocz(s->cluster_size);
1463
    if (!s->refcount_block)
1464
        goto fail;
1465

    
1466
    s->refcount_table_offset = offset;
1467
    header.refcount_table_offset = cpu_to_be64(offset);
1468
    header.refcount_table_clusters = cpu_to_be32(1);
1469
    offset += s->cluster_size;
1470

    
1471
    s->refcount_table[0] = cpu_to_be64(offset);
1472
    s->refcount_block_offset = offset;
1473
    offset += s->cluster_size;
1474

    
1475
    /* update refcounts */
1476
    create_refcount_update(s, 0, header_size);
1477
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
1478
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
1479
    create_refcount_update(s, s->refcount_block_offset, s->cluster_size);
1480

    
1481
    /* write all the data */
1482
    write(fd, &header, sizeof(header));
1483
    if (backing_file) {
1484
        write(fd, backing_file, backing_filename_len);
1485
    }
1486
    lseek(fd, s->l1_table_offset, SEEK_SET);
1487
    tmp = 0;
1488
    for(i = 0;i < l1_size; i++) {
1489
        write(fd, &tmp, sizeof(tmp));
1490
    }
1491
    lseek(fd, s->refcount_table_offset, SEEK_SET);
1492
    write(fd, s->refcount_table, s->cluster_size);
1493

    
1494
    lseek(fd, s->refcount_block_offset, SEEK_SET);
1495
    write(fd, s->refcount_block, s->cluster_size);
1496

    
1497
    qemu_free(s->refcount_table);
1498
    qemu_free(s->refcount_block);
1499
    close(fd);
1500
    return 0;
1501
 fail:
1502
    qemu_free(s->refcount_table);
1503
    qemu_free(s->refcount_block);
1504
    close(fd);
1505
    return -ENOMEM;
1506
}
1507

    
1508
static int qcow_make_empty(BlockDriverState *bs)
1509
{
1510
#if 0
1511
    /* XXX: not correct */
1512
    BDRVQcowState *s = bs->opaque;
1513
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1514
    int ret;
1515

1516
    memset(s->l1_table, 0, l1_length);
1517
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
1518
        return -1;
1519
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
1520
    if (ret < 0)
1521
        return ret;
1522

1523
    l2_cache_reset(bs);
1524
#endif
1525
    return 0;
1526
}
1527

    
1528
/* XXX: put compressed sectors first, then all the cluster aligned
1529
   tables to avoid losing bytes in alignment */
1530
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
1531
                                 const uint8_t *buf, int nb_sectors)
1532
{
1533
    BDRVQcowState *s = bs->opaque;
1534
    z_stream strm;
1535
    int ret, out_len;
1536
    uint8_t *out_buf;
1537
    uint64_t cluster_offset;
1538

    
1539
    if (nb_sectors == 0) {
1540
        /* align end of file to a sector boundary to ease reading with
1541
           sector based I/Os */
1542
        cluster_offset = bdrv_getlength(s->hd);
1543
        cluster_offset = (cluster_offset + 511) & ~511;
1544
        bdrv_truncate(s->hd, cluster_offset);
1545
        return 0;
1546
    }
1547

    
1548
    if (nb_sectors != s->cluster_sectors)
1549
        return -EINVAL;
1550

    
1551
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1552
    if (!out_buf)
1553
        return -ENOMEM;
1554

    
1555
    /* best compression, small window, no zlib header */
1556
    memset(&strm, 0, sizeof(strm));
1557
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1558
                       Z_DEFLATED, -12,
1559
                       9, Z_DEFAULT_STRATEGY);
1560
    if (ret != 0) {
1561
        qemu_free(out_buf);
1562
        return -1;
1563
    }
1564

    
1565
    strm.avail_in = s->cluster_size;
1566
    strm.next_in = (uint8_t *)buf;
1567
    strm.avail_out = s->cluster_size;
1568
    strm.next_out = out_buf;
1569

    
1570
    ret = deflate(&strm, Z_FINISH);
1571
    if (ret != Z_STREAM_END && ret != Z_OK) {
1572
        qemu_free(out_buf);
1573
        deflateEnd(&strm);
1574
        return -1;
1575
    }
1576
    out_len = strm.next_out - out_buf;
1577

    
1578
    deflateEnd(&strm);
1579

    
1580
    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1581
        /* could not compress: write normal cluster */
1582
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
1583
    } else {
1584
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
1585
                                              out_len);
1586
        if (!cluster_offset)
1587
            return -1;
1588
        cluster_offset &= s->cluster_offset_mask;
1589
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
1590
            qemu_free(out_buf);
1591
            return -1;
1592
        }
1593
    }
1594

    
1595
    qemu_free(out_buf);
1596
    return 0;
1597
}
1598

    
1599
static void qcow_flush(BlockDriverState *bs)
1600
{
1601
    BDRVQcowState *s = bs->opaque;
1602
    bdrv_flush(s->hd);
1603
}
1604

    
1605
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1606
{
1607
    BDRVQcowState *s = bs->opaque;
1608
    bdi->cluster_size = s->cluster_size;
1609
    bdi->vm_state_offset = (int64_t)s->l1_vm_state_index <<
1610
        (s->cluster_bits + s->l2_bits);
1611
    return 0;
1612
}
1613

    
1614
/*********************************************************/
1615
/* snapshot support */
1616

    
1617
/* update the refcounts of snapshots and the copied flag */
1618
static int update_snapshot_refcount(BlockDriverState *bs,
1619
                                    int64_t l1_table_offset,
1620
                                    int l1_size,
1621
                                    int addend)
1622
{
1623
    BDRVQcowState *s = bs->opaque;
1624
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
1625
    int64_t old_offset, old_l2_offset;
1626
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
1627

    
1628
    l2_cache_reset(bs);
1629

    
1630
    l2_table = NULL;
1631
    l1_table = NULL;
1632
    l1_size2 = l1_size * sizeof(uint64_t);
1633
    l1_allocated = 0;
1634
    if (l1_table_offset != s->l1_table_offset) {
1635
        l1_table = qemu_malloc(l1_size2);
1636
        if (!l1_table)
1637
            goto fail;
1638
        l1_allocated = 1;
1639
        if (bdrv_pread(s->hd, l1_table_offset,
1640
                       l1_table, l1_size2) != l1_size2)
1641
            goto fail;
1642
        for(i = 0;i < l1_size; i++)
1643
            be64_to_cpus(&l1_table[i]);
1644
    } else {
1645
        assert(l1_size == s->l1_size);
1646
        l1_table = s->l1_table;
1647
        l1_allocated = 0;
1648
    }
1649

    
1650
    l2_size = s->l2_size * sizeof(uint64_t);
1651
    l2_table = qemu_malloc(l2_size);
1652
    if (!l2_table)
1653
        goto fail;
1654
    l1_modified = 0;
1655
    for(i = 0; i < l1_size; i++) {
1656
        l2_offset = l1_table[i];
1657
        if (l2_offset) {
1658
            old_l2_offset = l2_offset;
1659
            l2_offset &= ~QCOW_OFLAG_COPIED;
1660
            l2_modified = 0;
1661
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
1662
                goto fail;
1663
            for(j = 0; j < s->l2_size; j++) {
1664
                offset = be64_to_cpu(l2_table[j]);
1665
                if (offset != 0) {
1666
                    old_offset = offset;
1667
                    offset &= ~QCOW_OFLAG_COPIED;
1668
                    if (offset & QCOW_OFLAG_COMPRESSED) {
1669
                        nb_csectors = ((offset >> s->csize_shift) &
1670
                                       s->csize_mask) + 1;
1671
                        if (addend != 0)
1672
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
1673
                                            nb_csectors * 512, addend);
1674
                        /* compressed clusters are never modified */
1675
                        refcount = 2;
1676
                    } else {
1677
                        if (addend != 0) {
1678
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
1679
                        } else {
1680
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
1681
                        }
1682
                    }
1683

    
1684
                    if (refcount == 1) {
1685
                        offset |= QCOW_OFLAG_COPIED;
1686
                    }
1687
                    if (offset != old_offset) {
1688
                        l2_table[j] = cpu_to_be64(offset);
1689
                        l2_modified = 1;
1690
                    }
1691
                }
1692
            }
1693
            if (l2_modified) {
1694
                if (bdrv_pwrite(s->hd,
1695
                                l2_offset, l2_table, l2_size) != l2_size)
1696
                    goto fail;
1697
            }
1698

    
1699
            if (addend != 0) {
1700
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
1701
            } else {
1702
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1703
            }
1704
            if (refcount == 1) {
1705
                l2_offset |= QCOW_OFLAG_COPIED;
1706
            }
1707
            if (l2_offset != old_l2_offset) {
1708
                l1_table[i] = l2_offset;
1709
                l1_modified = 1;
1710
            }
1711
        }
1712
    }
1713
    if (l1_modified) {
1714
        for(i = 0; i < l1_size; i++)
1715
            cpu_to_be64s(&l1_table[i]);
1716
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
1717
                        l1_size2) != l1_size2)
1718
            goto fail;
1719
        for(i = 0; i < l1_size; i++)
1720
            be64_to_cpus(&l1_table[i]);
1721
    }
1722
    if (l1_allocated)
1723
        qemu_free(l1_table);
1724
    qemu_free(l2_table);
1725
    return 0;
1726
 fail:
1727
    if (l1_allocated)
1728
        qemu_free(l1_table);
1729
    qemu_free(l2_table);
1730
    return -EIO;
1731
}
1732

    
1733
static void qcow_free_snapshots(BlockDriverState *bs)
1734
{
1735
    BDRVQcowState *s = bs->opaque;
1736
    int i;
1737

    
1738
    for(i = 0; i < s->nb_snapshots; i++) {
1739
        qemu_free(s->snapshots[i].name);
1740
        qemu_free(s->snapshots[i].id_str);
1741
    }
1742
    qemu_free(s->snapshots);
1743
    s->snapshots = NULL;
1744
    s->nb_snapshots = 0;
1745
}
1746

    
1747
static int qcow_read_snapshots(BlockDriverState *bs)
1748
{
1749
    BDRVQcowState *s = bs->opaque;
1750
    QCowSnapshotHeader h;
1751
    QCowSnapshot *sn;
1752
    int i, id_str_size, name_size;
1753
    int64_t offset;
1754
    uint32_t extra_data_size;
1755

    
1756
    offset = s->snapshots_offset;
1757
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
1758
    if (!s->snapshots)
1759
        goto fail;
1760
    for(i = 0; i < s->nb_snapshots; i++) {
1761
        offset = align_offset(offset, 8);
1762
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1763
            goto fail;
1764
        offset += sizeof(h);
1765
        sn = s->snapshots + i;
1766
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
1767
        sn->l1_size = be32_to_cpu(h.l1_size);
1768
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
1769
        sn->date_sec = be32_to_cpu(h.date_sec);
1770
        sn->date_nsec = be32_to_cpu(h.date_nsec);
1771
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
1772
        extra_data_size = be32_to_cpu(h.extra_data_size);
1773

    
1774
        id_str_size = be16_to_cpu(h.id_str_size);
1775
        name_size = be16_to_cpu(h.name_size);
1776

    
1777
        offset += extra_data_size;
1778

    
1779
        sn->id_str = qemu_malloc(id_str_size + 1);
1780
        if (!sn->id_str)
1781
            goto fail;
1782
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1783
            goto fail;
1784
        offset += id_str_size;
1785
        sn->id_str[id_str_size] = '\0';
1786

    
1787
        sn->name = qemu_malloc(name_size + 1);
1788
        if (!sn->name)
1789
            goto fail;
1790
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
1791
            goto fail;
1792
        offset += name_size;
1793
        sn->name[name_size] = '\0';
1794
    }
1795
    s->snapshots_size = offset - s->snapshots_offset;
1796
    return 0;
1797
 fail:
1798
    qcow_free_snapshots(bs);
1799
    return -1;
1800
}
1801

    
1802
/* add at the end of the file a new list of snapshots */
1803
static int qcow_write_snapshots(BlockDriverState *bs)
1804
{
1805
    BDRVQcowState *s = bs->opaque;
1806
    QCowSnapshot *sn;
1807
    QCowSnapshotHeader h;
1808
    int i, name_size, id_str_size, snapshots_size;
1809
    uint64_t data64;
1810
    uint32_t data32;
1811
    int64_t offset, snapshots_offset;
1812

    
1813
    /* compute the size of the snapshots */
1814
    offset = 0;
1815
    for(i = 0; i < s->nb_snapshots; i++) {
1816
        sn = s->snapshots + i;
1817
        offset = align_offset(offset, 8);
1818
        offset += sizeof(h);
1819
        offset += strlen(sn->id_str);
1820
        offset += strlen(sn->name);
1821
    }
1822
    snapshots_size = offset;
1823

    
1824
    snapshots_offset = alloc_clusters(bs, snapshots_size);
1825
    offset = snapshots_offset;
1826

    
1827
    for(i = 0; i < s->nb_snapshots; i++) {
1828
        sn = s->snapshots + i;
1829
        memset(&h, 0, sizeof(h));
1830
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
1831
        h.l1_size = cpu_to_be32(sn->l1_size);
1832
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
1833
        h.date_sec = cpu_to_be32(sn->date_sec);
1834
        h.date_nsec = cpu_to_be32(sn->date_nsec);
1835
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
1836

    
1837
        id_str_size = strlen(sn->id_str);
1838
        name_size = strlen(sn->name);
1839
        h.id_str_size = cpu_to_be16(id_str_size);
1840
        h.name_size = cpu_to_be16(name_size);
1841
        offset = align_offset(offset, 8);
1842
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1843
            goto fail;
1844
        offset += sizeof(h);
1845
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1846
            goto fail;
1847
        offset += id_str_size;
1848
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
1849
            goto fail;
1850
        offset += name_size;
1851
    }
1852

    
1853
    /* update the various header fields */
1854
    data64 = cpu_to_be64(snapshots_offset);
1855
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
1856
                    &data64, sizeof(data64)) != sizeof(data64))
1857
        goto fail;
1858
    data32 = cpu_to_be32(s->nb_snapshots);
1859
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
1860
                    &data32, sizeof(data32)) != sizeof(data32))
1861
        goto fail;
1862

    
1863
    /* free the old snapshot table */
1864
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
1865
    s->snapshots_offset = snapshots_offset;
1866
    s->snapshots_size = snapshots_size;
1867
    return 0;
1868
 fail:
1869
    return -1;
1870
}
1871

    
1872
static void find_new_snapshot_id(BlockDriverState *bs,
1873
                                 char *id_str, int id_str_size)
1874
{
1875
    BDRVQcowState *s = bs->opaque;
1876
    QCowSnapshot *sn;
1877
    int i, id, id_max = 0;
1878

    
1879
    for(i = 0; i < s->nb_snapshots; i++) {
1880
        sn = s->snapshots + i;
1881
        id = strtoul(sn->id_str, NULL, 10);
1882
        if (id > id_max)
1883
            id_max = id;
1884
    }
1885
    snprintf(id_str, id_str_size, "%d", id_max + 1);
1886
}
1887

    
1888
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
1889
{
1890
    BDRVQcowState *s = bs->opaque;
1891
    int i;
1892

    
1893
    for(i = 0; i < s->nb_snapshots; i++) {
1894
        if (!strcmp(s->snapshots[i].id_str, id_str))
1895
            return i;
1896
    }
1897
    return -1;
1898
}
1899

    
1900
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
1901
{
1902
    BDRVQcowState *s = bs->opaque;
1903
    int i, ret;
1904

    
1905
    ret = find_snapshot_by_id(bs, name);
1906
    if (ret >= 0)
1907
        return ret;
1908
    for(i = 0; i < s->nb_snapshots; i++) {
1909
        if (!strcmp(s->snapshots[i].name, name))
1910
            return i;
1911
    }
1912
    return -1;
1913
}
1914

    
1915
/* if no id is provided, a new one is constructed */
1916
static int qcow_snapshot_create(BlockDriverState *bs,
1917
                                QEMUSnapshotInfo *sn_info)
1918
{
1919
    BDRVQcowState *s = bs->opaque;
1920
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
1921
    int i, ret;
1922
    uint64_t *l1_table = NULL;
1923

    
1924
    memset(sn, 0, sizeof(*sn));
1925

    
1926
    if (sn_info->id_str[0] == '\0') {
1927
        /* compute a new id */
1928
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
1929
    }
1930

    
1931
    /* check that the ID is unique */
1932
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
1933
        return -ENOENT;
1934

    
1935
    sn->id_str = qemu_strdup(sn_info->id_str);
1936
    if (!sn->id_str)
1937
        goto fail;
1938
    sn->name = qemu_strdup(sn_info->name);
1939
    if (!sn->name)
1940
        goto fail;
1941
    sn->vm_state_size = sn_info->vm_state_size;
1942
    sn->date_sec = sn_info->date_sec;
1943
    sn->date_nsec = sn_info->date_nsec;
1944
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
1945

    
1946
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
1947
    if (ret < 0)
1948
        goto fail;
1949

    
1950
    /* create the L1 table of the snapshot */
1951
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
1952
    sn->l1_size = s->l1_size;
1953

    
1954
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
1955
    if (!l1_table)
1956
        goto fail;
1957
    for(i = 0; i < s->l1_size; i++) {
1958
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
1959
    }
1960
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
1961
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
1962
        (s->l1_size * sizeof(uint64_t)))
1963
        goto fail;
1964
    qemu_free(l1_table);
1965
    l1_table = NULL;
1966

    
1967
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
1968
    if (!snapshots1)
1969
        goto fail;
1970
    memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
1971
    s->snapshots = snapshots1;
1972
    s->snapshots[s->nb_snapshots++] = *sn;
1973

    
1974
    if (qcow_write_snapshots(bs) < 0)
1975
        goto fail;
1976
#ifdef DEBUG_ALLOC
1977
    check_refcounts(bs);
1978
#endif
1979
    return 0;
1980
 fail:
1981
    qemu_free(sn->name);
1982
    qemu_free(l1_table);
1983
    return -1;
1984
}
1985

    
1986
/* copy the snapshot 'snapshot_name' into the current disk image */
1987
static int qcow_snapshot_goto(BlockDriverState *bs,
1988
                              const char *snapshot_id)
1989
{
1990
    BDRVQcowState *s = bs->opaque;
1991
    QCowSnapshot *sn;
1992
    int i, snapshot_index, l1_size2;
1993

    
1994
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
1995
    if (snapshot_index < 0)
1996
        return -ENOENT;
1997
    sn = &s->snapshots[snapshot_index];
1998

    
1999
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
2000
        goto fail;
2001

    
2002
    if (grow_l1_table(bs, sn->l1_size) < 0)
2003
        goto fail;
2004

    
2005
    s->l1_size = sn->l1_size;
2006
    l1_size2 = s->l1_size * sizeof(uint64_t);
2007
    /* copy the snapshot l1 table to the current l1 table */
2008
    if (bdrv_pread(s->hd, sn->l1_table_offset,
2009
                   s->l1_table, l1_size2) != l1_size2)
2010
        goto fail;
2011
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
2012
                    s->l1_table, l1_size2) != l1_size2)
2013
        goto fail;
2014
    for(i = 0;i < s->l1_size; i++) {
2015
        be64_to_cpus(&s->l1_table[i]);
2016
    }
2017

    
2018
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
2019
        goto fail;
2020

    
2021
#ifdef DEBUG_ALLOC
2022
    check_refcounts(bs);
2023
#endif
2024
    return 0;
2025
 fail:
2026
    return -EIO;
2027
}
2028

    
2029
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2030
{
2031
    BDRVQcowState *s = bs->opaque;
2032
    QCowSnapshot *sn;
2033
    int snapshot_index, ret;
2034

    
2035
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
2036
    if (snapshot_index < 0)
2037
        return -ENOENT;
2038
    sn = &s->snapshots[snapshot_index];
2039

    
2040
    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
2041
    if (ret < 0)
2042
        return ret;
2043
    /* must update the copied flag on the current cluster offsets */
2044
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
2045
    if (ret < 0)
2046
        return ret;
2047
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));
2048

    
2049
    qemu_free(sn->id_str);
2050
    qemu_free(sn->name);
2051
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
2052
    s->nb_snapshots--;
2053
    ret = qcow_write_snapshots(bs);
2054
    if (ret < 0) {
2055
        /* XXX: restore snapshot if error ? */
2056
        return ret;
2057
    }
2058
#ifdef DEBUG_ALLOC
2059
    check_refcounts(bs);
2060
#endif
2061
    return 0;
2062
}
2063

    
2064
static int qcow_snapshot_list(BlockDriverState *bs,
2065
                              QEMUSnapshotInfo **psn_tab)
2066
{
2067
    BDRVQcowState *s = bs->opaque;
2068
    QEMUSnapshotInfo *sn_tab, *sn_info;
2069
    QCowSnapshot *sn;
2070
    int i;
2071

    
2072
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2073
    if (!sn_tab)
2074
        goto fail;
2075
    for(i = 0; i < s->nb_snapshots; i++) {
2076
        sn_info = sn_tab + i;
2077
        sn = s->snapshots + i;
2078
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2079
                sn->id_str);
2080
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2081
                sn->name);
2082
        sn_info->vm_state_size = sn->vm_state_size;
2083
        sn_info->date_sec = sn->date_sec;
2084
        sn_info->date_nsec = sn->date_nsec;
2085
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2086
    }
2087
    *psn_tab = sn_tab;
2088
    return s->nb_snapshots;
2089
 fail:
2090
    qemu_free(sn_tab);
2091
    *psn_tab = NULL;
2092
    return -ENOMEM;
2093
}
2094

    
2095
/*********************************************************/
2096
/* refcount handling */
2097

    
2098
static int refcount_init(BlockDriverState *bs)
2099
{
2100
    BDRVQcowState *s = bs->opaque;
2101
    int ret, refcount_table_size2, i;
2102

    
2103
    s->refcount_block_cache = qemu_malloc(s->cluster_size);
2104
    if (!s->refcount_block_cache)
2105
        goto fail;
2106
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
2107
    s->refcount_table = qemu_malloc(refcount_table_size2);
2108
    if (!s->refcount_table)
2109
        goto fail;
2110
    if (s->refcount_table_size > 0) {
2111
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
2112
                         s->refcount_table, refcount_table_size2);
2113
        if (ret != refcount_table_size2)
2114
            goto fail;
2115
        for(i = 0; i < s->refcount_table_size; i++)
2116
            be64_to_cpus(&s->refcount_table[i]);
2117
    }
2118
    return 0;
2119
 fail:
2120
    return -ENOMEM;
2121
}
2122

    
2123
static void refcount_close(BlockDriverState *bs)
2124
{
2125
    BDRVQcowState *s = bs->opaque;
2126
    qemu_free(s->refcount_block_cache);
2127
    qemu_free(s->refcount_table);
2128
}
2129

    
2130

    
2131
static int load_refcount_block(BlockDriverState *bs,
2132
                               int64_t refcount_block_offset)
2133
{
2134
    BDRVQcowState *s = bs->opaque;
2135
    int ret;
2136
    ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
2137
                     s->cluster_size);
2138
    if (ret != s->cluster_size)
2139
        return -EIO;
2140
    s->refcount_block_cache_offset = refcount_block_offset;
2141
    return 0;
2142
}
2143

    
2144
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
2145
{
2146
    BDRVQcowState *s = bs->opaque;
2147
    int refcount_table_index, block_index;
2148
    int64_t refcount_block_offset;
2149

    
2150
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2151
    if (refcount_table_index >= s->refcount_table_size)
2152
        return 0;
2153
    refcount_block_offset = s->refcount_table[refcount_table_index];
2154
    if (!refcount_block_offset)
2155
        return 0;
2156
    if (refcount_block_offset != s->refcount_block_cache_offset) {
2157
        /* better than nothing: return allocated if read error */
2158
        if (load_refcount_block(bs, refcount_block_offset) < 0)
2159
            return 1;
2160
    }
2161
    block_index = cluster_index &
2162
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2163
    return be16_to_cpu(s->refcount_block_cache[block_index]);
2164
}
2165

    
2166
/* return < 0 if error */
2167
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2168
{
2169
    BDRVQcowState *s = bs->opaque;
2170
    int i, nb_clusters;
2171

    
2172
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
2173
    for(;;) {
2174
        if (get_refcount(bs, s->free_cluster_index) == 0) {
2175
            s->free_cluster_index++;
2176
            for(i = 1; i < nb_clusters; i++) {
2177
                if (get_refcount(bs, s->free_cluster_index) != 0)
2178
                    goto not_found;
2179
                s->free_cluster_index++;
2180
            }
2181
#ifdef DEBUG_ALLOC2
2182
            printf("alloc_clusters: size=%lld -> %lld\n",
2183
                   size,
2184
                   (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2185
#endif
2186
            return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2187
        } else {
2188
        not_found:
2189
            s->free_cluster_index++;
2190
        }
2191
    }
2192
}
2193

    
2194
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
2195
{
2196
    int64_t offset;
2197

    
2198
    offset = alloc_clusters_noref(bs, size);
2199
    update_refcount(bs, offset, size, 1);
2200
    return offset;
2201
}
2202

    
2203
/* only used to allocate compressed sectors. We try to allocate
2204
   contiguous sectors. size must be <= cluster_size */
2205
static int64_t alloc_bytes(BlockDriverState *bs, int size)
2206
{
2207
    BDRVQcowState *s = bs->opaque;
2208
    int64_t offset, cluster_offset;
2209
    int free_in_cluster;
2210

    
2211
    assert(size > 0 && size <= s->cluster_size);
2212
    if (s->free_byte_offset == 0) {
2213
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
2214
    }
2215
 redo:
2216
    free_in_cluster = s->cluster_size -
2217
        (s->free_byte_offset & (s->cluster_size - 1));
2218
    if (size <= free_in_cluster) {
2219
        /* enough space in current cluster */
2220
        offset = s->free_byte_offset;
2221
        s->free_byte_offset += size;
2222
        free_in_cluster -= size;
2223
        if (free_in_cluster == 0)
2224
            s->free_byte_offset = 0;
2225
        if ((offset & (s->cluster_size - 1)) != 0)
2226
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2227
    } else {
2228
        offset = alloc_clusters(bs, s->cluster_size);
2229
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
2230
        if ((cluster_offset + s->cluster_size) == offset) {
2231
            /* we are lucky: contiguous data */
2232
            offset = s->free_byte_offset;
2233
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2234
            s->free_byte_offset += size;
2235
        } else {
2236
            s->free_byte_offset = offset;
2237
            goto redo;
2238
        }
2239
    }
2240
    return offset;
2241
}
2242

    
2243
static void free_clusters(BlockDriverState *bs,
2244
                          int64_t offset, int64_t size)
2245
{
2246
    update_refcount(bs, offset, size, -1);
2247
}
2248

    
2249
static int grow_refcount_table(BlockDriverState *bs, int min_size)
2250
{
2251
    BDRVQcowState *s = bs->opaque;
2252
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
2253
    uint64_t *new_table;
2254
    int64_t table_offset;
2255
    uint64_t data64;
2256
    uint32_t data32;
2257
    int old_table_size;
2258
    int64_t old_table_offset;
2259

    
2260
    if (min_size <= s->refcount_table_size)
2261
        return 0;
2262
    /* compute new table size */
2263
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
2264
    for(;;) {
2265
        if (refcount_table_clusters == 0) {
2266
            refcount_table_clusters = 1;
2267
        } else {
2268
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
2269
        }
2270
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
2271
        if (min_size <= new_table_size)
2272
            break;
2273
    }
2274
#ifdef DEBUG_ALLOC2
2275
    printf("grow_refcount_table from %d to %d\n",
2276
           s->refcount_table_size,
2277
           new_table_size);
2278
#endif
2279
    new_table_size2 = new_table_size * sizeof(uint64_t);
2280
    new_table = qemu_mallocz(new_table_size2);
2281
    if (!new_table)
2282
        return -ENOMEM;
2283
    memcpy(new_table, s->refcount_table,
2284
           s->refcount_table_size * sizeof(uint64_t));
2285
    for(i = 0; i < s->refcount_table_size; i++)
2286
        cpu_to_be64s(&new_table[i]);
2287
    /* Note: we cannot update the refcount now to avoid recursion */
2288
    table_offset = alloc_clusters_noref(bs, new_table_size2);
2289
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
2290
    if (ret != new_table_size2)
2291
        goto fail;
2292
    for(i = 0; i < s->refcount_table_size; i++)
2293
        be64_to_cpus(&new_table[i]);
2294

    
2295
    data64 = cpu_to_be64(table_offset);
2296
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
2297
                    &data64, sizeof(data64)) != sizeof(data64))
2298
        goto fail;
2299
    data32 = cpu_to_be32(refcount_table_clusters);
2300
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_clusters),
2301
                    &data32, sizeof(data32)) != sizeof(data32))
2302
        goto fail;
2303
    qemu_free(s->refcount_table);
2304
    old_table_offset = s->refcount_table_offset;
2305
    old_table_size = s->refcount_table_size;
2306
    s->refcount_table = new_table;
2307
    s->refcount_table_size = new_table_size;
2308
    s->refcount_table_offset = table_offset;
2309

    
2310
    update_refcount(bs, table_offset, new_table_size2, 1);
2311
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
2312
    return 0;
2313
 fail:
2314
    free_clusters(bs, table_offset, new_table_size2);
2315
    qemu_free(new_table);
2316
    return -EIO;
2317
}
2318

    
2319
/* addend must be 1 or -1 */
2320
/* XXX: cache several refcount block clusters ? */
2321
static int update_cluster_refcount(BlockDriverState *bs,
2322
                                   int64_t cluster_index,
2323
                                   int addend)
2324
{
2325
    BDRVQcowState *s = bs->opaque;
2326
    int64_t offset, refcount_block_offset;
2327
    int ret, refcount_table_index, block_index, refcount;
2328
    uint64_t data64;
2329

    
2330
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2331
    if (refcount_table_index >= s->refcount_table_size) {
2332
        if (addend < 0)
2333
            return -EINVAL;
2334
        ret = grow_refcount_table(bs, refcount_table_index + 1);
2335
        if (ret < 0)
2336
            return ret;
2337
    }
2338
    refcount_block_offset = s->refcount_table[refcount_table_index];
2339
    if (!refcount_block_offset) {
2340
        if (addend < 0)
2341
            return -EINVAL;
2342
        /* create a new refcount block */
2343
        /* Note: we cannot update the refcount now to avoid recursion */
2344
        offset = alloc_clusters_noref(bs, s->cluster_size);
2345
        memset(s->refcount_block_cache, 0, s->cluster_size);
2346
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
2347
        if (ret != s->cluster_size)
2348
            return -EINVAL;
2349
        s->refcount_table[refcount_table_index] = offset;
2350
        data64 = cpu_to_be64(offset);
2351
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
2352
                          refcount_table_index * sizeof(uint64_t),
2353
                          &data64, sizeof(data64));
2354
        if (ret != sizeof(data64))
2355
            return -EINVAL;
2356

    
2357
        refcount_block_offset = offset;
2358
        s->refcount_block_cache_offset = offset;
2359
        update_refcount(bs, offset, s->cluster_size, 1);
2360
    } else {
2361
        if (refcount_block_offset != s->refcount_block_cache_offset) {
2362
            if (load_refcount_block(bs, refcount_block_offset) < 0)
2363
                return -EIO;
2364
        }
2365
    }
2366
    /* we can update the count and save it */
2367
    block_index = cluster_index &
2368
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2369
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
2370
    refcount += addend;
2371
    if (refcount < 0 || refcount > 0xffff)
2372
        return -EINVAL;
2373
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
2374
        s->free_cluster_index = cluster_index;
2375
    }
2376
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
2377
    if (bdrv_pwrite(s->hd,
2378
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
2379
                    &s->refcount_block_cache[block_index], 2) != 2)
2380
        return -EIO;
2381
    return refcount;
2382
}
2383

    
2384
static void update_refcount(BlockDriverState *bs,
2385
                            int64_t offset, int64_t length,
2386
                            int addend)
2387
{
2388
    BDRVQcowState *s = bs->opaque;
2389
    int64_t start, last, cluster_offset;
2390

    
2391
#ifdef DEBUG_ALLOC2
2392
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
2393
           offset, length, addend);
2394
#endif
2395
    if (length <= 0)
2396
        return;
2397
    start = offset & ~(s->cluster_size - 1);
2398
    last = (offset + length - 1) & ~(s->cluster_size - 1);
2399
    for(cluster_offset = start; cluster_offset <= last;
2400
        cluster_offset += s->cluster_size) {
2401
        update_cluster_refcount(bs, cluster_offset >> s->cluster_bits, addend);
2402
    }
2403
}
2404

    
2405
#ifdef DEBUG_ALLOC
2406
static void inc_refcounts(BlockDriverState *bs,
2407
                          uint16_t *refcount_table,
2408
                          int refcount_table_size,
2409
                          int64_t offset, int64_t size)
2410
{
2411
    BDRVQcowState *s = bs->opaque;
2412
    int64_t start, last, cluster_offset;
2413
    int k;
2414

    
2415
    if (size <= 0)
2416
        return;
2417

    
2418
    start = offset & ~(s->cluster_size - 1);
2419
    last = (offset + size - 1) & ~(s->cluster_size - 1);
2420
    for(cluster_offset = start; cluster_offset <= last;
2421
        cluster_offset += s->cluster_size) {
2422
        k = cluster_offset >> s->cluster_bits;
2423
        if (k < 0 || k >= refcount_table_size) {
2424
            printf("ERROR: invalid cluster offset=0x%llx\n", cluster_offset);
2425
        } else {
2426
            if (++refcount_table[k] == 0) {
2427
                printf("ERROR: overflow cluster offset=0x%llx\n", cluster_offset);
2428
            }
2429
        }
2430
    }
2431
}
2432

    
2433
static int check_refcounts_l1(BlockDriverState *bs,
2434
                              uint16_t *refcount_table,
2435
                              int refcount_table_size,
2436
                              int64_t l1_table_offset, int l1_size,
2437
                              int check_copied)
2438
{
2439
    BDRVQcowState *s = bs->opaque;
2440
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
2441
    int l2_size, i, j, nb_csectors, refcount;
2442

    
2443
    l2_table = NULL;
2444
    l1_size2 = l1_size * sizeof(uint64_t);
2445

    
2446
    inc_refcounts(bs, refcount_table, refcount_table_size,
2447
                  l1_table_offset, l1_size2);
2448

    
2449
    l1_table = qemu_malloc(l1_size2);
2450
    if (!l1_table)
2451
        goto fail;
2452
    if (bdrv_pread(s->hd, l1_table_offset,
2453
                   l1_table, l1_size2) != l1_size2)
2454
        goto fail;
2455
    for(i = 0;i < l1_size; i++)
2456
        be64_to_cpus(&l1_table[i]);
2457

    
2458
    l2_size = s->l2_size * sizeof(uint64_t);
2459
    l2_table = qemu_malloc(l2_size);
2460
    if (!l2_table)
2461
        goto fail;
2462
    for(i = 0; i < l1_size; i++) {
2463
        l2_offset = l1_table[i];
2464
        if (l2_offset) {
2465
            if (check_copied) {
2466
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
2467
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
2468
                    printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
2469
                           l2_offset, refcount);
2470
                }
2471
            }
2472
            l2_offset &= ~QCOW_OFLAG_COPIED;
2473
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
2474
                goto fail;
2475
            for(j = 0; j < s->l2_size; j++) {
2476
                offset = be64_to_cpu(l2_table[j]);
2477
                if (offset != 0) {
2478
                    if (offset & QCOW_OFLAG_COMPRESSED) {
2479
                        if (offset & QCOW_OFLAG_COPIED) {
2480
                            printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
2481
                                   offset >> s->cluster_bits);
2482
                            offset &= ~QCOW_OFLAG_COPIED;
2483
                        }
2484
                        nb_csectors = ((offset >> s->csize_shift) &
2485
                                       s->csize_mask) + 1;
2486
                        offset &= s->cluster_offset_mask;
2487
                        inc_refcounts(bs, refcount_table,
2488
                                      refcount_table_size,
2489
                                      offset & ~511, nb_csectors * 512);
2490
                    } else {
2491
                        if (check_copied) {
2492
                            refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
2493
                            if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
2494
                                printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
2495
                                       offset, refcount);
2496
                            }
2497
                        }
2498
                        offset &= ~QCOW_OFLAG_COPIED;
2499
                        inc_refcounts(bs, refcount_table,
2500
                                      refcount_table_size,
2501
                                      offset, s->cluster_size);
2502
                    }
2503
                }
2504
            }
2505
            inc_refcounts(bs, refcount_table,
2506
                          refcount_table_size,
2507
                          l2_offset,
2508
                          s->cluster_size);
2509
        }
2510
    }
2511
    qemu_free(l1_table);
2512
    qemu_free(l2_table);
2513
    return 0;
2514
 fail:
2515
    printf("ERROR: I/O error in check_refcounts_l1\n");
2516
    qemu_free(l1_table);
2517
    qemu_free(l2_table);
2518
    return -EIO;
2519
}
2520

    
2521
static void check_refcounts(BlockDriverState *bs)
2522
{
2523
    BDRVQcowState *s = bs->opaque;
2524
    int64_t size;
2525
    int nb_clusters, refcount1, refcount2, i;
2526
    QCowSnapshot *sn;
2527
    uint16_t *refcount_table;
2528

    
2529
    size = bdrv_getlength(s->hd);
2530
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
2531
    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
2532

    
2533
    /* header */
2534
    inc_refcounts(bs, refcount_table, nb_clusters,
2535
                  0, s->cluster_size);
2536

    
2537
    check_refcounts_l1(bs, refcount_table, nb_clusters,
2538
                       s->l1_table_offset, s->l1_size, 1);
2539

    
2540
    /* snapshots */
2541
    for(i = 0; i < s->nb_snapshots; i++) {
2542
        sn = s->snapshots + i;
2543
        check_refcounts_l1(bs, refcount_table, nb_clusters,
2544
                           sn->l1_table_offset, sn->l1_size, 0);
2545
    }
2546
    inc_refcounts(bs, refcount_table, nb_clusters,
2547
                  s->snapshots_offset, s->snapshots_size);
2548

    
2549
    /* refcount data */
2550
    inc_refcounts(bs, refcount_table, nb_clusters,
2551
                  s->refcount_table_offset,
2552
                  s->refcount_table_size * sizeof(uint64_t));
2553
    for(i = 0; i < s->refcount_table_size; i++) {
2554
        int64_t offset;
2555
        offset = s->refcount_table[i];
2556
        if (offset != 0) {
2557
            inc_refcounts(bs, refcount_table, nb_clusters,
2558
                          offset, s->cluster_size);
2559
        }
2560
    }
2561

    
2562
    /* compare ref counts */
2563
    for(i = 0; i < nb_clusters; i++) {
2564
        refcount1 = get_refcount(bs, i);
2565
        refcount2 = refcount_table[i];
2566
        if (refcount1 != refcount2)
2567
            printf("ERROR cluster %d refcount=%d reference=%d\n",
2568
                   i, refcount1, refcount2);
2569
    }
2570

    
2571
    qemu_free(refcount_table);
2572
}
2573

    
2574
#if 0
2575
static void dump_refcounts(BlockDriverState *bs)
2576
{
2577
    BDRVQcowState *s = bs->opaque;
2578
    int64_t nb_clusters, k, k1, size;
2579
    int refcount;
2580

2581
    size = bdrv_getlength(s->hd);
2582
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
2583
    for(k = 0; k < nb_clusters;) {
2584
        k1 = k;
2585
        refcount = get_refcount(bs, k);
2586
        k++;
2587
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
2588
            k++;
2589
        printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1);
2590
    }
2591
}
2592
#endif
2593
#endif
2594

    
2595
BlockDriver bdrv_qcow2 = {
2596
    "qcow2",
2597
    sizeof(BDRVQcowState),
2598
    qcow_probe,
2599
    qcow_open,
2600
    NULL,
2601
    NULL,
2602
    qcow_close,
2603
    qcow_create,
2604
    qcow_flush,
2605
    qcow_is_allocated,
2606
    qcow_set_key,
2607
    qcow_make_empty,
2608

    
2609
    .bdrv_aio_read = qcow_aio_read,
2610
    .bdrv_aio_write = qcow_aio_write,
2611
    .bdrv_aio_cancel = qcow_aio_cancel,
2612
    .aiocb_size = sizeof(QCowAIOCB),
2613
    .bdrv_write_compressed = qcow_write_compressed,
2614

    
2615
    .bdrv_snapshot_create = qcow_snapshot_create,
2616
    .bdrv_snapshot_goto = qcow_snapshot_goto,
2617
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2618
    .bdrv_snapshot_list = qcow_snapshot_list,
2619
    .bdrv_get_info = qcow_get_info,
2620
};