Statistics
| Branch: | Revision:

root / block-qcow2.c @ 943984c7

History | View | Annotate | Download (80.1 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block_int.h"
26
#include <zlib.h>
27
#include "aes.h"
28
#include <assert.h>
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots.
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables always have a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
47
//#define DEBUG_ALLOC2
48

    
49
/* magic number at the start of the header: the bytes 'Q' 'F' 'I' 0xfb */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 2

/* values of the crypt_method header field */
#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

/* the cluster_data buffer is sized for this many clusters (plus one sector) */
#define QCOW_MAX_CRYPT_CLUSTERS 32

/*
 * Flag bits stored in the top of L1/L2 entries.  They are unsigned
 * constants: shifting 1 into bit 63 of a signed long long is undefined
 * behaviour, and the entries they are combined with are uint64_t anyway.
 */
/* indicate that the refcount of the referenced cluster is exactly one. */
#define QCOW_OFLAG_COPIED     (1ULL << 63)
/* indicate that the cluster is compressed (they never have the copied flag) */
#define QCOW_OFLAG_COMPRESSED (1ULL << 62)

#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
63

    
64
/*
 * Image header as laid out at offset 0 of the file.  qcow_open() reads it
 * in one piece and converts every field from big-endian.
 */
struct QCowHeader {
    uint32_t magic;                   /* must be QCOW_MAGIC */
    uint32_t version;                 /* must be QCOW_VERSION */

    /* backing file name location; offset 0 means no backing file */
    uint64_t backing_file_offset;
    uint32_t backing_file_size;

    uint32_t cluster_bits;
    uint64_t size; /* in bytes */
    uint32_t crypt_method;            /* QCOW_CRYPT_xxx */

    uint32_t l1_size; /* XXX: save number of clusters instead ? */
    uint64_t l1_table_offset;

    uint64_t refcount_table_offset;
    uint32_t refcount_table_clusters;

    uint32_t nb_snapshots;
    uint64_t snapshots_offset;
};
typedef struct QCowHeader QCowHeader;
79

    
80
/*
 * Fixed-size part of one snapshot table record (packed, no padding).
 * It is followed in the file by the extra data, the id string and the
 * name, in that order.
 */
struct __attribute__((packed)) QCowSnapshotHeader {
    /* header is 8 byte aligned */
    uint64_t l1_table_offset;
    uint32_t l1_size;

    /* byte lengths of the two strings that follow the record */
    uint16_t id_str_size;
    uint16_t name_size;

    uint32_t date_sec;
    uint32_t date_nsec;
    uint64_t vm_clock_nsec;

    uint32_t vm_state_size;
    uint32_t extra_data_size; /* for extension */
    /* extra data follows */
    /* id_str follows */
    /* name follows  */
};
typedef struct QCowSnapshotHeader QCowSnapshotHeader;
99

    
100
/* number of L2 tables kept in the in-memory L2 cache */
#define L2_CACHE_SIZE 16

/* In-memory description of one snapshot. */
struct QCowSnapshot {
    /* L1 table of the snapshot */
    uint64_t l1_table_offset;
    uint32_t l1_size;

    /* identification strings */
    char *id_str;
    char *name;

    uint32_t vm_state_size;

    /* timestamps */
    uint32_t date_sec;
    uint32_t date_nsec;
    uint64_t vm_clock_nsec;
};
typedef struct QCowSnapshot QCowSnapshot;
112

    
113
typedef struct BDRVQcowState {
114
    BlockDriverState *hd;
115
    int cluster_bits;
116
    int cluster_size;
117
    int cluster_sectors;
118
    int l2_bits;
119
    int l2_size;
120
    int l1_size;
121
    int l1_vm_state_index;
122
    int csize_shift;
123
    int csize_mask;
124
    uint64_t cluster_offset_mask;
125
    uint64_t l1_table_offset;
126
    uint64_t *l1_table;
127
    uint64_t *l2_cache;
128
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
129
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
130
    uint8_t *cluster_cache;
131
    uint8_t *cluster_data;
132
    uint64_t cluster_cache_offset;
133

    
134
    uint64_t *refcount_table;
135
    uint64_t refcount_table_offset;
136
    uint32_t refcount_table_size;
137
    uint64_t refcount_block_cache_offset;
138
    uint16_t *refcount_block_cache;
139
    int64_t free_cluster_index;
140
    int64_t free_byte_offset;
141

    
142
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
143
    uint32_t crypt_method_header;
144
    AES_KEY aes_encrypt_key;
145
    AES_KEY aes_decrypt_key;
146
    uint64_t snapshots_offset;
147
    int snapshots_size;
148
    int nb_snapshots;
149
    QCowSnapshot *snapshots;
150
} BDRVQcowState;
151

    
152
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
153
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
154
                     uint8_t *buf, int nb_sectors);
155
static int qcow_read_snapshots(BlockDriverState *bs);
156
static void qcow_free_snapshots(BlockDriverState *bs);
157
static int refcount_init(BlockDriverState *bs);
158
static void refcount_close(BlockDriverState *bs);
159
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
160
static int update_cluster_refcount(BlockDriverState *bs,
161
                                   int64_t cluster_index,
162
                                   int addend);
163
static void update_refcount(BlockDriverState *bs,
164
                            int64_t offset, int64_t length,
165
                            int addend);
166
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
167
static int64_t alloc_bytes(BlockDriverState *bs, int size);
168
static void free_clusters(BlockDriverState *bs,
169
                          int64_t offset, int64_t size);
170
#ifdef DEBUG_ALLOC
171
static void check_refcounts(BlockDriverState *bs);
172
#endif
173

    
174
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
175
{
176
    const QCowHeader *cow_header = (const void *)buf;
177

    
178
    if (buf_size >= sizeof(QCowHeader) &&
179
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
180
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
181
        return 100;
182
    else
183
        return 0;
184
}
185

    
186
/*
 * Open a QCOW2 image.
 *
 * Opens the image file into s->hd, reads the header and converts it from
 * big-endian, validates it, derives the cluster and L2 geometry, loads the
 * L1 table, allocates the L2 cache and the cluster buffers, initializes the
 * refcount state, reads the backing file name and the snapshot table.
 *
 * Returns 0 on success, the bdrv_file_open() error code if the file cannot
 * be opened, or -1 on any later failure.
 */
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift, ret;
    QCowHeader header;

    /* Performance is terrible right now with cache=writethrough due mainly
     * to reference count updates.  If the user does not explicitly specify
     * a caching type, force to writeback caching.
     */
    if ((flags & BDRV_O_CACHE_DEF)) {
        flags |= BDRV_O_CACHE_WB;
        flags &= ~BDRV_O_CACHE_DEF;
    }
    ret = bdrv_file_open(&s->hd, filename, flags);
    if (ret < 0)
        return ret;
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
        goto fail;
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
        goto fail;
    /* cluster sizes from 512 bytes to 64 KB are accepted */
    if (header.size <= 1 ||
        header.cluster_bits < 9 ||
        header.cluster_bits > 16)
        goto fail;
    if (header.crypt_method > QCOW_CRYPT_AES)
        goto fail;
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header)
        bs->encrypted = 1;
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
    s->l1_size = header.l1_size;
    shift = s->cluster_bits + s->l2_bits;
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index)
        goto fail;
    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
        s->l1_size * sizeof(uint64_t))
        goto fail;
    /* the L1 table is kept in host byte order in memory */
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    s->cluster_cache = qemu_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                                  + 512);
    s->cluster_cache_offset = -1;

    if (refcount_init(bs) < 0)
        goto fail;

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        /* Clamp while the value is still unsigned.  backing_file_size comes
           from the image; a value above INT_MAX would become negative once
           stored in 'len', slip past a signed "> 1023" test and then be
           used as a read length and as an index into backing_file[]. */
        if (header.backing_file_size > 1023)
            len = 1023;
        else
            len = header.backing_file_size;
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
            goto fail;
        bs->backing_file[len] = '\0';
    }
    if (qcow_read_snapshots(bs) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;

 fail:
    /* NOTE(review): some of these pointers may never have been assigned on
       the early failure paths; this presumably relies on *s starting out
       zeroed -- confirm against the caller. */
    qcow_free_snapshots(bs);
    refcount_close(bs);
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    bdrv_delete(s->hd);
    return -1;
}
300

    
301
/*
 * Install the AES key derived from a passphrase.
 *
 * The 128 bit key is the passphrase truncated, or zero padded, to 16 bytes.
 * s->crypt_method is switched to the method announced in the header before
 * the key schedules are computed.
 *
 * Returns 0 on success, -1 if either key schedule cannot be set up.
 */
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len, i;

    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for (i = 0; i < 16; i++)
        keybuf[i] = (i < len) ? key[i] : 0;

    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0 ||
        AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;
#if 0
    /* test */
    {
        uint8_t in[16];
        uint8_t out[16];
        uint8_t tmp[16];
        for(i=0;i<16;i++)
            in[i] = i;
        AES_encrypt(in, tmp, &s->aes_encrypt_key);
        AES_decrypt(tmp, out, &s->aes_decrypt_key);
        for(i = 0; i < 16; i++)
            printf(" %02x", tmp[i]);
        printf("\n");
        for(i = 0; i < 16; i++)
            printf(" %02x", out[i]);
        printf("\n");
    }
#endif
    return 0;
}
342

    
343
/* The crypt function is compatible with the linux cryptoloop
344
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
345
   supported */
346
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
347
                            uint8_t *out_buf, const uint8_t *in_buf,
348
                            int nb_sectors, int enc,
349
                            const AES_KEY *key)
350
{
351
    union {
352
        uint64_t ll[2];
353
        uint8_t b[16];
354
    } ivec;
355
    int i;
356

    
357
    for(i = 0; i < nb_sectors; i++) {
358
        ivec.ll[0] = cpu_to_le64(sector_num);
359
        ivec.ll[1] = 0;
360
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
361
                        ivec.b, enc);
362
        sector_num++;
363
        in_buf += 512;
364
        out_buf += 512;
365
    }
366
}
367

    
368
/*
 * Copy the sectors [n_start, n_end) of the guest cluster that starts at
 * sector start_sect into the host cluster at byte offset cluster_offset.
 *
 * The data is read with qcow_read() into s->cluster_data, encrypted in
 * place when a crypt method is active, and written to s->hd.
 *
 * Returns 0 on success or when the range is empty, otherwise the negative
 * value returned by the failing read or write.
 */
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                        uint64_t cluster_offset, int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int count = n_end - n_start;
    int ret;

    if (count <= 0)
        return 0;

    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, count);
    if (ret < 0)
        return ret;

    if (s->crypt_method) {
        encrypt_sectors(s, start_sect + n_start,
                        s->cluster_data, s->cluster_data,
                        count, 1, &s->aes_encrypt_key);
    }

    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
                     s->cluster_data, count);
    return (ret < 0) ? ret : 0;
}
392

    
393
/*
 * Empty the L2 cache: zero the cached tables, their offsets and their
 * hit counters.
 */
static void l2_cache_reset(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    memset(s->l2_cache_counts, 0, sizeof(s->l2_cache_counts));
    memset(s->l2_cache_offsets, 0, sizeof(s->l2_cache_offsets));
    memset(s->l2_cache, 0,
           s->l2_size * L2_CACHE_SIZE * sizeof(*s->l2_cache));
}
401

    
402
/*
 * Pick the L2 cache slot to reuse: the one with the smallest hit count,
 * the lowest index winning ties.  Returns the slot index.
 */
static inline int l2_cache_new_entry(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint32_t lowest = 0xffffffff;
    int victim = 0;
    int slot;

    for (slot = 0; slot < L2_CACHE_SIZE; slot++) {
        if (s->l2_cache_counts[slot] >= lowest)
            continue;
        lowest = s->l2_cache_counts[slot];
        victim = slot;
    }
    return victim;
}
419

    
420
/*
 * Round offset up to the next multiple of n.  The mask arithmetic only
 * gives a meaningful result when n is a power of two.
 */
static int64_t align_offset(int64_t offset, int n)
{
    const int low_bits = n - 1;

    return (offset + low_bits) & ~low_bits;
}
425

    
426
/*
 * Grow the L1 table so that it holds at least min_size entries.
 *
 * A larger, zero filled table is allocated, the current entries are copied
 * into it, it is written to newly allocated clusters, and the header fields
 * l1_size and l1_table_offset are updated.  Only then are the old table and
 * its clusters released and the in-memory state switched over.
 *
 * Returns 0 on success (or when the table is already large enough) and
 * -EIO when a write fails; in that case the current L1 table stays in use.
 */
static int grow_l1_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size, new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    uint64_t new_l1_table_offset;
    uint8_t data[12];

    new_l1_size = s->l1_size;
    if (min_size <= new_l1_size)
        return 0;
    /* (0 * 3 + 1) / 2 is 0: starting from an empty table the loop below
       would never terminate */
    if (new_l1_size == 0)
        new_l1_size = 1;
    while (min_size > new_l1_size) {
        new_l1_size = (new_l1_size * 3 + 1) / 2;
    }
#ifdef DEBUG_ALLOC2
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_mallocz(new_l1_size2);
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);

    /* convert to big-endian for the write, then back for in-memory use;
       the entries past l1_size are zero and need no conversion */
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
    if (ret != new_l1_size2)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table: l1_size (4 bytes) and l1_table_offset (8 bytes) are
       adjacent in the header and are written with a single request */
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
                sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->l1_table);
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
 fail:
    /* Release the table that was being built.  The live s->l1_table must
       stay allocated: it is still referenced by the driver state. */
    qemu_free(new_l1_table);
    /* NOTE(review): the clusters obtained from alloc_clusters() above are
       not given back on this path. */
    return -EIO;
}
475

    
476
/*
477
 * seek_l2_table
478
 *
479
 * seek l2_offset in the l2_cache table
480
 * if not found, return NULL,
481
 * if found,
482
 *   increments the l2 cache hit count of the entry,
483
 *   if counter overflow, divide by two all counters
484
 *   return the pointer to the l2 cache entry
485
 *
486
 */
487

    
488
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
489
{
490
    int i, j;
491

    
492
    for(i = 0; i < L2_CACHE_SIZE; i++) {
493
        if (l2_offset == s->l2_cache_offsets[i]) {
494
            /* increment the hit count */
495
            if (++s->l2_cache_counts[i] == 0xffffffff) {
496
                for(j = 0; j < L2_CACHE_SIZE; j++) {
497
                    s->l2_cache_counts[j] >>= 1;
498
                }
499
            }
500
            return s->l2_cache + (i << s->l2_bits);
501
        }
502
    }
503
    return NULL;
504
}
505

    
506
/*
507
 * l2_load
508
 *
509
 * Loads a L2 table into memory. If the table is in the cache, the cache
510
 * is used; otherwise the L2 table is loaded from the image file.
511
 *
512
 * Returns a pointer to the L2 table on success, or NULL if the read from
513
 * the image file failed.
514
 */
515

    
516
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
517
{
518
    BDRVQcowState *s = bs->opaque;
519
    int min_index;
520
    uint64_t *l2_table;
521

    
522
    /* seek if the table for the given offset is in the cache */
523

    
524
    l2_table = seek_l2_table(s, l2_offset);
525
    if (l2_table != NULL)
526
        return l2_table;
527

    
528
    /* not found: load a new entry in the least used one */
529

    
530
    min_index = l2_cache_new_entry(bs);
531
    l2_table = s->l2_cache + (min_index << s->l2_bits);
532
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
533
        s->l2_size * sizeof(uint64_t))
534
        return NULL;
535
    s->l2_cache_offsets[min_index] = l2_offset;
536
    s->l2_cache_counts[min_index] = 1;
537

    
538
    return l2_table;
539
}
540

    
541
/*
542
 * l2_allocate
543
 *
544
 * Allocate a new l2 entry in the file. If l1_index points to an already
545
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
546
 * table) copy the contents of the old L2 table into the newly allocated one.
547
 * Otherwise the new table is initialized with zeros.
548
 *
549
 */
550

    
551
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
552
{
553
    BDRVQcowState *s = bs->opaque;
554
    int min_index;
555
    uint64_t old_l2_offset, tmp;
556
    uint64_t *l2_table, l2_offset;
557

    
558
    old_l2_offset = s->l1_table[l1_index];
559

    
560
    /* allocate a new l2 entry */
561

    
562
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
563

    
564
    /* update the L1 entry */
565

    
566
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
567

    
568
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
569
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
570
                    &tmp, sizeof(tmp)) != sizeof(tmp))
571
        return NULL;
572

    
573
    /* allocate a new entry in the l2 cache */
574

    
575
    min_index = l2_cache_new_entry(bs);
576
    l2_table = s->l2_cache + (min_index << s->l2_bits);
577

    
578
    if (old_l2_offset == 0) {
579
        /* if there was no old l2 table, clear the new table */
580
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
581
    } else {
582
        /* if there was an old l2 table, read it from the disk */
583
        if (bdrv_pread(s->hd, old_l2_offset,
584
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
585
            s->l2_size * sizeof(uint64_t))
586
            return NULL;
587
    }
588
    /* write the l2 table to the file */
589
    if (bdrv_pwrite(s->hd, l2_offset,
590
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
591
        s->l2_size * sizeof(uint64_t))
592
        return NULL;
593

    
594
    /* update the l2 cache entry */
595

    
596
    s->l2_cache_offsets[min_index] = l2_offset;
597
    s->l2_cache_counts[min_index] = 1;
598

    
599
    return l2_table;
600
}
601

    
602
/* Number of clusters needed to hold size bytes (rounded up). */
static int size_to_clusters(BDRVQcowState *s, int64_t size)
{
    int64_t rounded = size + (s->cluster_size - 1);

    return rounded >> s->cluster_bits;
}
606

    
607
/*
 * Count how many of the L2 entries l2_table[start .. start + nb_clusters - 1]
 * continue the run that begins at l2_table[0], i.e. entry i equals the
 * offset in entry 0 plus i * cluster_size.  The bits set in mask are
 * ignored in every entry.
 *
 * Returns 0 when entry 0, with the mask bits cleared, is zero.
 */
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t start, uint64_t mask)
{
    const uint64_t keep = ~mask;
    const uint64_t first = be64_to_cpu(l2_table[0]) & keep;
    const uint64_t end = start + nb_clusters;
    int idx;

    if (first == 0)
        return 0;

    idx = start;
    while (idx < end) {
        uint64_t expected = first + idx * cluster_size;

        if ((be64_to_cpu(l2_table[idx]) & keep) != expected)
            break;
        idx++;
    }

    return idx - start;
}
622

    
623
/*
 * Count the leading zero entries of l2_table, looking at no more than
 * nb_clusters entries.
 */
static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
{
    int free_run;

    for (free_run = 0; nb_clusters != 0; nb_clusters--, free_run++) {
        if (l2_table[free_run] != 0)
            break;
    }

    return free_run;
}
632

    
633
/*
634
 * get_cluster_offset
635
 *
636
 * For a given offset of the disk image, return cluster offset in
637
 * qcow2 file.
638
 *
639
 * on entry, *num is the number of contiguous clusters we'd like to
640
 * access following offset.
641
 *
642
 * on exit, *num is the number of contiguous clusters we can read.
643
 *
644
 * Return 1, if the offset is found
645
 * Return 0, otherwise.
646
 *
647
 */
648

    
649
static uint64_t get_cluster_offset(BlockDriverState *bs,
650
                                   uint64_t offset, int *num)
651
{
652
    BDRVQcowState *s = bs->opaque;
653
    int l1_index, l2_index;
654
    uint64_t l2_offset, *l2_table, cluster_offset;
655
    int l1_bits, c;
656
    int index_in_cluster, nb_available, nb_needed, nb_clusters;
657

    
658
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
659
    nb_needed = *num + index_in_cluster;
660

    
661
    l1_bits = s->l2_bits + s->cluster_bits;
662

    
663
    /* compute how many bytes there are between the offset and
664
     * the end of the l1 entry
665
     */
666

    
667
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
668

    
669
    /* compute the number of available sectors */
670

    
671
    nb_available = (nb_available >> 9) + index_in_cluster;
672

    
673
    cluster_offset = 0;
674

    
675
    /* seek the the l2 offset in the l1 table */
676

    
677
    l1_index = offset >> l1_bits;
678
    if (l1_index >= s->l1_size)
679
        goto out;
680

    
681
    l2_offset = s->l1_table[l1_index];
682

    
683
    /* seek the l2 table of the given l2 offset */
684

    
685
    if (!l2_offset)
686
        goto out;
687

    
688
    /* load the l2 table in memory */
689

    
690
    l2_offset &= ~QCOW_OFLAG_COPIED;
691
    l2_table = l2_load(bs, l2_offset);
692
    if (l2_table == NULL)
693
        return 0;
694

    
695
    /* find the cluster offset for the given disk offset */
696

    
697
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
698
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
699
    nb_clusters = size_to_clusters(s, nb_needed << 9);
700

    
701
    if (!cluster_offset) {
702
        /* how many empty clusters ? */
703
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
704
    } else {
705
        /* how many allocated clusters ? */
706
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
707
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
708
    }
709

    
710
   nb_available = (c * s->cluster_sectors);
711
out:
712
    if (nb_available > nb_needed)
713
        nb_available = nb_needed;
714

    
715
    *num = nb_available - index_in_cluster;
716

    
717
    return cluster_offset & ~QCOW_OFLAG_COPIED;
718
}
719

    
720
/*
721
 * free_any_clusters
722
 *
723
 * free clusters according to its type: compressed or not
724
 *
725
 */
726

    
727
static void free_any_clusters(BlockDriverState *bs,
728
                              uint64_t cluster_offset, int nb_clusters)
729
{
730
    BDRVQcowState *s = bs->opaque;
731

    
732
    /* free the cluster */
733

    
734
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
735
        int nb_csectors;
736
        nb_csectors = ((cluster_offset >> s->csize_shift) &
737
                       s->csize_mask) + 1;
738
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
739
                      nb_csectors * 512);
740
        return;
741
    }
742

    
743
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
744

    
745
    return;
746
}
747

    
748
/*
749
 * get_cluster_table
750
 *
751
 * for a given disk offset, load (and allocate if needed)
752
 * the l2 table.
753
 *
754
 * the l2 table offset in the qcow2 file and the cluster index
755
 * in the l2 table are given to the caller.
756
 *
757
 */
758

    
759
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
760
                             uint64_t **new_l2_table,
761
                             uint64_t *new_l2_offset,
762
                             int *new_l2_index)
763
{
764
    BDRVQcowState *s = bs->opaque;
765
    int l1_index, l2_index, ret;
766
    uint64_t l2_offset, *l2_table;
767

    
768
    /* seek the the l2 offset in the l1 table */
769

    
770
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
771
    if (l1_index >= s->l1_size) {
772
        ret = grow_l1_table(bs, l1_index + 1);
773
        if (ret < 0)
774
            return 0;
775
    }
776
    l2_offset = s->l1_table[l1_index];
777

    
778
    /* seek the l2 table of the given l2 offset */
779

    
780
    if (l2_offset & QCOW_OFLAG_COPIED) {
781
        /* load the l2 table in memory */
782
        l2_offset &= ~QCOW_OFLAG_COPIED;
783
        l2_table = l2_load(bs, l2_offset);
784
        if (l2_table == NULL)
785
            return 0;
786
    } else {
787
        if (l2_offset)
788
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
789
        l2_table = l2_allocate(bs, l1_index);
790
        if (l2_table == NULL)
791
            return 0;
792
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
793
    }
794

    
795
    /* find the cluster offset for the given disk offset */
796

    
797
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
798

    
799
    *new_l2_table = l2_table;
800
    *new_l2_offset = l2_offset;
801
    *new_l2_index = l2_index;
802

    
803
    return 1;
804
}
805

    
806
/*
807
 * alloc_compressed_cluster_offset
808
 *
809
 * For a given offset of the disk image, return cluster offset in
810
 * qcow2 file.
811
 *
812
 * If the offset is not found, allocate a new compressed cluster.
813
 *
814
 * Return the cluster offset if successful,
815
 * Return 0, otherwise.
816
 *
817
 */
818

    
819
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
820
                                                uint64_t offset,
821
                                                int compressed_size)
822
{
823
    BDRVQcowState *s = bs->opaque;
824
    int l2_index, ret;
825
    uint64_t l2_offset, *l2_table, cluster_offset;
826
    int nb_csectors;
827

    
828
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
829
    if (ret == 0)
830
        return 0;
831

    
832
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
833
    if (cluster_offset & QCOW_OFLAG_COPIED)
834
        return cluster_offset & ~QCOW_OFLAG_COPIED;
835

    
836
    if (cluster_offset)
837
        free_any_clusters(bs, cluster_offset, 1);
838

    
839
    cluster_offset = alloc_bytes(bs, compressed_size);
840
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
841
                  (cluster_offset >> 9);
842

    
843
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
844
                      ((uint64_t)nb_csectors << s->csize_shift);
845

    
846
    /* update L2 table */
847

    
848
    /* compressed clusters never have the copied flag */
849

    
850
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
851
    if (bdrv_pwrite(s->hd,
852
                    l2_offset + l2_index * sizeof(uint64_t),
853
                    l2_table + l2_index,
854
                    sizeof(uint64_t)) != sizeof(uint64_t))
855
        return 0;
856

    
857
    return cluster_offset;
858
}
859

    
860
/*
 * Description of a cluster allocation that alloc_cluster_link_l2() still
 * has to link into the L2 table.
 */
struct QCowL2Meta {
    uint64_t offset;    /* guest offset of the request */
    int n_start;        /* sectors to copy at the start of the first cluster */
    int nb_available;
    int nb_clusters;    /* number of L2 entries to update; 0 means none */
};
typedef struct QCowL2Meta QCowL2Meta;
867

    
868
static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
869
        QCowL2Meta *m)
870
{
871
    BDRVQcowState *s = bs->opaque;
872
    int i, j = 0, l2_index, ret;
873
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
874

    
875
    if (m->nb_clusters == 0)
876
        return 0;
877

    
878
    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
879

    
880
    /* copy content of unmodified sectors */
881
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
882
    if (m->n_start) {
883
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
884
        if (ret < 0)
885
            goto err;
886
    }
887

    
888
    if (m->nb_available & (s->cluster_sectors - 1)) {
889
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
890
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
891
                m->nb_available - end, s->cluster_sectors);
892
        if (ret < 0)
893
            goto err;
894
    }
895

    
896
    ret = -EIO;
897
    /* update L2 table */
898
    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
899
        goto err;
900

    
901
    for (i = 0; i < m->nb_clusters; i++) {
902
        if(l2_table[l2_index + i] != 0)
903
            old_cluster[j++] = l2_table[l2_index + i];
904

    
905
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
906
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
907
     }
908

    
909
    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
910
                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
911
            m->nb_clusters * sizeof(uint64_t))
912
        goto err;
913

    
914
    for (i = 0; i < j; i++)
915
        free_any_clusters(bs, old_cluster[i], 1);
916

    
917
    ret = 0;
918
err:
919
    qemu_free(old_cluster);
920
    return ret;
921
 }
922

    
923
/*
924
 * alloc_cluster_offset
925
 *
926
 * For a given offset of the disk image, return cluster offset in
927
 * qcow2 file.
928
 *
929
 * If the offset is not found, allocate a new cluster.
930
 *
931
 * Return the cluster offset if successful,
932
 * Return 0, otherwise.
933
 *
934
 */
935

    
936
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
937
                                     uint64_t offset,
938
                                     int n_start, int n_end,
939
                                     int *num, QCowL2Meta *m)
940
{
941
    BDRVQcowState *s = bs->opaque;
942
    int l2_index, ret;
943
    uint64_t l2_offset, *l2_table, cluster_offset;
944
    int nb_clusters, i = 0;
945

    
946
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
947
    if (ret == 0)
948
        return 0;
949

    
950
    nb_clusters = size_to_clusters(s, n_end << 9);
951

    
952
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
953

    
954
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
955

    
956
    /* We keep all QCOW_OFLAG_COPIED clusters */
957

    
958
    if (cluster_offset & QCOW_OFLAG_COPIED) {
959
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
960
                &l2_table[l2_index], 0, 0);
961

    
962
        cluster_offset &= ~QCOW_OFLAG_COPIED;
963
        m->nb_clusters = 0;
964

    
965
        goto out;
966
    }
967

    
968
    /* for the moment, multiple compressed clusters are not managed */
969

    
970
    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
971
        nb_clusters = 1;
972

    
973
    /* how many available clusters ? */
974

    
975
    while (i < nb_clusters) {
976
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
977
                &l2_table[l2_index], i, 0);
978

    
979
        if(be64_to_cpu(l2_table[l2_index + i]))
980
            break;
981

    
982
        i += count_contiguous_free_clusters(nb_clusters - i,
983
                &l2_table[l2_index + i]);
984

    
985
        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
986

    
987
        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
988
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
989
            break;
990
    }
991
    nb_clusters = i;
992

    
993
    /* allocate a new cluster */
994

    
995
    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
996

    
997
    /* save info needed for meta data update */
998
    m->offset = offset;
999
    m->n_start = n_start;
1000
    m->nb_clusters = nb_clusters;
1001

    
1002
out:
1003
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
1004

    
1005
    *num = m->nb_available - n_start;
1006

    
1007
    return cluster_offset;
1008
}
1009

    
1010
/*
 * Tell whether get_cluster_offset() finds a cluster for sector_num.
 *
 * *pnum is preset to nb_sectors and then handed to get_cluster_offset(),
 * which may adjust it. Returns 1 when the reported cluster offset is
 * non-zero, 0 otherwise.
 */
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    *pnum = nb_sectors;
    return get_cluster_offset(bs, sector_num << 9, pnum) != 0;
}
1020

    
1021
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1022
                             const uint8_t *buf, int buf_size)
1023
{
1024
    z_stream strm1, *strm = &strm1;
1025
    int ret, out_len;
1026

    
1027
    memset(strm, 0, sizeof(*strm));
1028

    
1029
    strm->next_in = (uint8_t *)buf;
1030
    strm->avail_in = buf_size;
1031
    strm->next_out = out_buf;
1032
    strm->avail_out = out_buf_size;
1033

    
1034
    ret = inflateInit2(strm, -12);
1035
    if (ret != Z_OK)
1036
        return -1;
1037
    ret = inflate(strm, Z_FINISH);
1038
    out_len = strm->next_out - out_buf;
1039
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1040
        out_len != out_buf_size) {
1041
        inflateEnd(strm);
1042
        return -1;
1043
    }
1044
    inflateEnd(strm);
1045
    return 0;
1046
}
1047

    
1048
/*
 * Make sure s->cluster_cache holds the decompressed contents of the
 * compressed cluster described by cluster_offset.
 *
 * Nothing is done when s->cluster_cache_offset already matches. Otherwise
 * the compressed sectors are read into s->cluster_data and inflated into
 * s->cluster_cache.
 *
 * Returns 0 on success, -1 on read or decompression failure.
 */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
{
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset == coffset)
        return 0;

    nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
    sector_offset = coffset & 511;
    csize = nb_csectors * 512 - sector_offset;
    ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
    if (ret < 0) {
        return -1;
    }

    /*
     * The cache buffer is about to be overwritten: forget the previously
     * cached cluster first, so that a failed inflate cannot leave stale
     * bookkeeping pointing at partially overwritten data.
     */
    s->cluster_cache_offset = -1;
    if (decompress_buffer(s->cluster_cache, s->cluster_size,
                          s->cluster_data + sector_offset, csize) < 0) {
        return -1;
    }
    s->cluster_cache_offset = coffset;
    return 0;
}
1070

    
1071
/* handle reading after the end of the backing file */
1072
static int backing_read1(BlockDriverState *bs,
1073
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
1074
{
1075
    int n1;
1076
    if ((sector_num + nb_sectors) <= bs->total_sectors)
1077
        return nb_sectors;
1078
    if (sector_num >= bs->total_sectors)
1079
        n1 = 0;
1080
    else
1081
        n1 = bs->total_sectors - sector_num;
1082
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
1083
    return n1;
1084
}
1085

    
1086
/*
 * Synchronous read of nb_sectors sectors starting at sector_num into buf.
 *
 * The request is served chunk by chunk, each chunk being the run reported
 * by get_cluster_offset():
 *  - no cluster offset: data comes from the backing image if there is one
 *    (zero-filled past its end), otherwise the chunk is zero-filled;
 *  - compressed cluster: data is copied out of the decompression cache;
 *  - otherwise: data is read from the image file and decrypted when
 *    s->crypt_method is set.
 *
 * Returns 0 on success, -1 on failure.
 */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;

    while (nb_sectors > 0) {
        uint64_t cluster_offset;
        int index_in_cluster, n, n1, ret;

        n = nb_sectors;
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);

        if (cluster_offset == 0) {
            if (!bs->backing_hd) {
                memset(buf, 0, 512 * n);
            } else {
                /* read from the base image */
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
                if (n1 > 0) {
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
                    if (ret < 0)
                        return -1;
                }
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512,
                             buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }

        /* advance to the next chunk */
        buf += n * 512;
        sector_num += n;
        nb_sectors -= n;
    }
    return 0;
}
1128

    
1129
/*
 * Synchronous write of nb_sectors sectors from buf starting at sector_num.
 *
 * For every chunk, alloc_cluster_offset() provides the target clusters, the
 * data is written (encrypted through s->cluster_data when s->crypt_method
 * is set; such chunks are capped at QCOW_MAX_CRYPT_CLUSTERS clusters) and
 * the L2 table is then updated with alloc_cluster_link_l2().
 *
 * Returns 0 on success, -1 on failure. The compressed-cluster cache is
 * invalidated on the success path.
 */
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
                     const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    QCowL2Meta l2meta;

    while (nb_sectors > 0) {
        const uint8_t *payload;
        uint64_t cluster_offset;
        int index_in_cluster, n_end, n, ret;

        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n_end = index_in_cluster + nb_sectors;
        if (s->crypt_method &&
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
        }

        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
                                              index_in_cluster,
                                              n_end, &n, &l2meta);
        if (cluster_offset == 0)
            return -1;

        /* choose what actually goes to disk: ciphertext or the caller data */
        if (s->crypt_method) {
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
                            &s->aes_encrypt_key);
            payload = s->cluster_data;
        } else {
            payload = buf;
        }
        ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
                          payload, n * 512);

        if (ret != n * 512 ||
            alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
            return -1;
        }

        buf += n * 512;
        sector_num += n;
        nb_sectors -= n;
    }
    s->cluster_cache_offset = -1; /* disable compressed cache */
    return 0;
}
1168

    
1169
typedef struct QCowAIOCB {
1170
    BlockDriverAIOCB common;
1171
    int64_t sector_num;
1172
    uint8_t *buf;
1173
    int nb_sectors;
1174
    int n;
1175
    uint64_t cluster_offset;
1176
    uint8_t *cluster_data;
1177
    BlockDriverAIOCB *hd_aiocb;
1178
    QEMUBH *bh;
1179
    QCowL2Meta l2meta;
1180
} QCowAIOCB;
1181

    
1182
static void qcow_aio_read_cb(void *opaque, int ret);
1183
static void qcow_aio_read_bh(void *opaque)
1184
{
1185
    QCowAIOCB *acb = opaque;
1186
    qemu_bh_delete(acb->bh);
1187
    acb->bh = NULL;
1188
    qcow_aio_read_cb(opaque, 0);
1189
}
1190

    
1191
/*
 * Create a bottom half running cb(acb) and schedule it.
 *
 * Returns 0 on success, -EIO when acb already owns a bottom half or when a
 * new one cannot be created.
 */
static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
{
    QEMUBH *bh;

    if (acb->bh != NULL)
        return -EIO;

    bh = qemu_bh_new(cb, acb);
    acb->bh = bh;
    if (bh == NULL)
        return -EIO;

    qemu_bh_schedule(bh);
    return 0;
}
1204

    
1205
static void qcow_aio_read_cb(void *opaque, int ret)
1206
{
1207
    QCowAIOCB *acb = opaque;
1208
    BlockDriverState *bs = acb->common.bs;
1209
    BDRVQcowState *s = bs->opaque;
1210
    int index_in_cluster, n1;
1211

    
1212
    acb->hd_aiocb = NULL;
1213
    if (ret < 0) {
1214
fail:
1215
        acb->common.cb(acb->common.opaque, ret);
1216
        qemu_aio_release(acb);
1217
        return;
1218
    }
1219

    
1220
    /* post process the read buffer */
1221
    if (!acb->cluster_offset) {
1222
        /* nothing to do */
1223
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1224
        /* nothing to do */
1225
    } else {
1226
        if (s->crypt_method) {
1227
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
1228
                            acb->n, 0,
1229
                            &s->aes_decrypt_key);
1230
        }
1231
    }
1232

    
1233
    acb->nb_sectors -= acb->n;
1234
    acb->sector_num += acb->n;
1235
    acb->buf += acb->n * 512;
1236

    
1237
    if (acb->nb_sectors == 0) {
1238
        /* request completed */
1239
        acb->common.cb(acb->common.opaque, 0);
1240
        qemu_aio_release(acb);
1241
        return;
1242
    }
1243

    
1244
    /* prepare next AIO request */
1245
    acb->n = acb->nb_sectors;
1246
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
1247
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1248

    
1249
    if (!acb->cluster_offset) {
1250
        if (bs->backing_hd) {
1251
            /* read from the base image */
1252
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
1253
                               acb->buf, acb->n);
1254
            if (n1 > 0) {
1255
                acb->hd_aiocb = bdrv_aio_read(bs->backing_hd, acb->sector_num,
1256
                                    acb->buf, acb->n, qcow_aio_read_cb, acb);
1257
                if (acb->hd_aiocb == NULL)
1258
                    goto fail;
1259
            } else {
1260
                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1261
                if (ret < 0)
1262
                    goto fail;
1263
            }
1264
        } else {
1265
            /* Note: in this case, no need to wait */
1266
            memset(acb->buf, 0, 512 * acb->n);
1267
            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1268
            if (ret < 0)
1269
                goto fail;
1270
        }
1271
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1272
        /* add AIO support for compressed blocks ? */
1273
        if (decompress_cluster(s, acb->cluster_offset) < 0)
1274
            goto fail;
1275
        memcpy(acb->buf,
1276
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
1277
        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1278
        if (ret < 0)
1279
            goto fail;
1280
    } else {
1281
        if ((acb->cluster_offset & 511) != 0) {
1282
            ret = -EIO;
1283
            goto fail;
1284
        }
1285
        acb->hd_aiocb = bdrv_aio_read(s->hd,
1286
                            (acb->cluster_offset >> 9) + index_in_cluster,
1287
                            acb->buf, acb->n, qcow_aio_read_cb, acb);
1288
        if (acb->hd_aiocb == NULL)
1289
            goto fail;
1290
    }
1291
}
1292

    
1293
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1294
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1295
        BlockDriverCompletionFunc *cb, void *opaque)
1296
{
1297
    QCowAIOCB *acb;
1298

    
1299
    acb = qemu_aio_get(bs, cb, opaque);
1300
    if (!acb)
1301
        return NULL;
1302
    acb->hd_aiocb = NULL;
1303
    acb->sector_num = sector_num;
1304
    acb->buf = buf;
1305
    acb->nb_sectors = nb_sectors;
1306
    acb->n = 0;
1307
    acb->cluster_offset = 0;
1308
    acb->l2meta.nb_clusters = 0;
1309
    return acb;
1310
}
1311

    
1312
static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
1313
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1314
        BlockDriverCompletionFunc *cb, void *opaque)
1315
{
1316
    QCowAIOCB *acb;
1317

    
1318
    acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
1319
    if (!acb)
1320
        return NULL;
1321

    
1322
    qcow_aio_read_cb(acb, 0);
1323
    return &acb->common;
1324
}
1325

    
1326
static void qcow_aio_write_cb(void *opaque, int ret)
1327
{
1328
    QCowAIOCB *acb = opaque;
1329
    BlockDriverState *bs = acb->common.bs;
1330
    BDRVQcowState *s = bs->opaque;
1331
    int index_in_cluster;
1332
    const uint8_t *src_buf;
1333
    int n_end;
1334

    
1335
    acb->hd_aiocb = NULL;
1336

    
1337
    if (ret < 0) {
1338
    fail:
1339
        acb->common.cb(acb->common.opaque, ret);
1340
        qemu_aio_release(acb);
1341
        return;
1342
    }
1343

    
1344
    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
1345
        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
1346
        goto fail;
1347
    }
1348

    
1349
    acb->nb_sectors -= acb->n;
1350
    acb->sector_num += acb->n;
1351
    acb->buf += acb->n * 512;
1352

    
1353
    if (acb->nb_sectors == 0) {
1354
        /* request completed */
1355
        acb->common.cb(acb->common.opaque, 0);
1356
        qemu_aio_release(acb);
1357
        return;
1358
    }
1359

    
1360
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1361
    n_end = index_in_cluster + acb->nb_sectors;
1362
    if (s->crypt_method &&
1363
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1364
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1365

    
1366
    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1367
                                          index_in_cluster,
1368
                                          n_end, &acb->n, &acb->l2meta);
1369
    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
1370
        ret = -EIO;
1371
        goto fail;
1372
    }
1373
    if (s->crypt_method) {
1374
        if (!acb->cluster_data) {
1375
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
1376
                                             s->cluster_size);
1377
        }
1378
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
1379
                        acb->n, 1, &s->aes_encrypt_key);
1380
        src_buf = acb->cluster_data;
1381
    } else {
1382
        src_buf = acb->buf;
1383
    }
1384
    acb->hd_aiocb = bdrv_aio_write(s->hd,
1385
                                   (acb->cluster_offset >> 9) + index_in_cluster,
1386
                                   src_buf, acb->n,
1387
                                   qcow_aio_write_cb, acb);
1388
    if (acb->hd_aiocb == NULL)
1389
        goto fail;
1390
}
1391

    
1392
static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
1393
        int64_t sector_num, const uint8_t *buf, int nb_sectors,
1394
        BlockDriverCompletionFunc *cb, void *opaque)
1395
{
1396
    BDRVQcowState *s = bs->opaque;
1397
    QCowAIOCB *acb;
1398

    
1399
    s->cluster_cache_offset = -1; /* disable compressed cache */
1400

    
1401
    acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
1402
    if (!acb)
1403
        return NULL;
1404

    
1405
    qcow_aio_write_cb(acb, 0);
1406
    return &acb->common;
1407
}
1408

    
1409
/*
 * Cancel a qcow2 request: cancel the lower-level request if one is in
 * flight, then release the AIOCB.
 */
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QCowAIOCB *request = (QCowAIOCB *)blockacb;
    BlockDriverAIOCB *pending = request->hd_aiocb;

    if (pending != NULL)
        bdrv_aio_cancel(pending);

    qemu_aio_release(request);
}
1416

    
1417
/*
 * Release everything held by the driver state: the in-memory tables and
 * caches, the refcount state, and the underlying image file.
 */
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;

    /* metadata tables */
    qemu_free(state->l1_table);
    qemu_free(state->l2_cache);

    /* compressed-cluster buffers */
    qemu_free(state->cluster_cache);
    qemu_free(state->cluster_data);

    refcount_close(bs);
    bdrv_delete(state->hd);
}
1427

    
1428
/* XXX: use std qcow open function ? */

/* Scratch state used by qcow_create() while laying out a new image. */
typedef struct QCowCreateState {
    int cluster_size;               /* cluster size in bytes */
    int cluster_bits;               /* log2 of cluster_size */
    uint16_t *refcount_block;       /* refcount block, big-endian entries */
    uint64_t *refcount_table;       /* refcount table, big-endian entries */
    int64_t l1_table_offset;        /* file offset of the L1 table */
    int64_t refcount_table_offset;  /* file offset of the refcount table */
    int64_t refcount_block_offset;  /* file offset of the refcount block */
} QCowCreateState;
1438

    
1439
/*
 * Increment, in the in-memory refcount block of s, the reference count of
 * every cluster touched by the byte range [offset, offset + size).
 * Entries are kept in big-endian order.
 */
static void create_refcount_update(QCowCreateState *s,
                                   int64_t offset, int64_t size)
{
    int64_t first, final, pos;

    /* round both ends of the range down to a cluster boundary */
    first = offset & ~(s->cluster_size - 1);
    final = (offset + size - 1) & ~(s->cluster_size - 1);

    pos = first;
    while (pos <= final) {
        uint16_t *entry = &s->refcount_block[pos >> s->cluster_bits];
        int count = be16_to_cpu(*entry);

        count++;
        *entry = cpu_to_be16(count);
        pos += s->cluster_size;
    }
}
1456

    
1457
static int qcow_create(const char *filename, int64_t total_size,
1458
                      const char *backing_file, int flags)
1459
{
1460
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
1461
    QCowHeader header;
1462
    uint64_t tmp, offset;
1463
    QCowCreateState s1, *s = &s1;
1464

    
1465
    memset(s, 0, sizeof(*s));
1466

    
1467
    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
1468
    if (fd < 0)
1469
        return -1;
1470
    memset(&header, 0, sizeof(header));
1471
    header.magic = cpu_to_be32(QCOW_MAGIC);
1472
    header.version = cpu_to_be32(QCOW_VERSION);
1473
    header.size = cpu_to_be64(total_size * 512);
1474
    header_size = sizeof(header);
1475
    backing_filename_len = 0;
1476
    if (backing_file) {
1477
        header.backing_file_offset = cpu_to_be64(header_size);
1478
        backing_filename_len = strlen(backing_file);
1479
        header.backing_file_size = cpu_to_be32(backing_filename_len);
1480
        header_size += backing_filename_len;
1481
    }
1482
    s->cluster_bits = 12;  /* 4 KB clusters */
1483
    s->cluster_size = 1 << s->cluster_bits;
1484
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
1485
    header_size = (header_size + 7) & ~7;
1486
    if (flags & BLOCK_FLAG_ENCRYPT) {
1487
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1488
    } else {
1489
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1490
    }
1491
    l2_bits = s->cluster_bits - 3;
1492
    shift = s->cluster_bits + l2_bits;
1493
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
1494
    offset = align_offset(header_size, s->cluster_size);
1495
    s->l1_table_offset = offset;
1496
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
1497
    header.l1_size = cpu_to_be32(l1_size);
1498
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
1499

    
1500
    s->refcount_table = qemu_mallocz(s->cluster_size);
1501
    s->refcount_block = qemu_mallocz(s->cluster_size);
1502

    
1503
    s->refcount_table_offset = offset;
1504
    header.refcount_table_offset = cpu_to_be64(offset);
1505
    header.refcount_table_clusters = cpu_to_be32(1);
1506
    offset += s->cluster_size;
1507

    
1508
    s->refcount_table[0] = cpu_to_be64(offset);
1509
    s->refcount_block_offset = offset;
1510
    offset += s->cluster_size;
1511

    
1512
    /* update refcounts */
1513
    create_refcount_update(s, 0, header_size);
1514
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
1515
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
1516
    create_refcount_update(s, s->refcount_block_offset, s->cluster_size);
1517

    
1518
    /* write all the data */
1519
    write(fd, &header, sizeof(header));
1520
    if (backing_file) {
1521
        write(fd, backing_file, backing_filename_len);
1522
    }
1523
    lseek(fd, s->l1_table_offset, SEEK_SET);
1524
    tmp = 0;
1525
    for(i = 0;i < l1_size; i++) {
1526
        write(fd, &tmp, sizeof(tmp));
1527
    }
1528
    lseek(fd, s->refcount_table_offset, SEEK_SET);
1529
    write(fd, s->refcount_table, s->cluster_size);
1530

    
1531
    lseek(fd, s->refcount_block_offset, SEEK_SET);
1532
    write(fd, s->refcount_block, s->cluster_size);
1533

    
1534
    qemu_free(s->refcount_table);
1535
    qemu_free(s->refcount_block);
1536
    close(fd);
1537
    return 0;
1538
}
1539

    
1540
/*
 * Emptying the image is currently a no-op that reports success: the
 * implementation below is compiled out because it is marked as not
 * correct.
 */
static int qcow_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1559

    
1560
/* XXX: put compressed sectors first, then all the cluster aligned
1561
   tables to avoid losing bytes in alignment */
1562
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
1563
                                 const uint8_t *buf, int nb_sectors)
1564
{
1565
    BDRVQcowState *s = bs->opaque;
1566
    z_stream strm;
1567
    int ret, out_len;
1568
    uint8_t *out_buf;
1569
    uint64_t cluster_offset;
1570

    
1571
    if (nb_sectors == 0) {
1572
        /* align end of file to a sector boundary to ease reading with
1573
           sector based I/Os */
1574
        cluster_offset = bdrv_getlength(s->hd);
1575
        cluster_offset = (cluster_offset + 511) & ~511;
1576
        bdrv_truncate(s->hd, cluster_offset);
1577
        return 0;
1578
    }
1579

    
1580
    if (nb_sectors != s->cluster_sectors)
1581
        return -EINVAL;
1582

    
1583
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1584

    
1585
    /* best compression, small window, no zlib header */
1586
    memset(&strm, 0, sizeof(strm));
1587
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1588
                       Z_DEFLATED, -12,
1589
                       9, Z_DEFAULT_STRATEGY);
1590
    if (ret != 0) {
1591
        qemu_free(out_buf);
1592
        return -1;
1593
    }
1594

    
1595
    strm.avail_in = s->cluster_size;
1596
    strm.next_in = (uint8_t *)buf;
1597
    strm.avail_out = s->cluster_size;
1598
    strm.next_out = out_buf;
1599

    
1600
    ret = deflate(&strm, Z_FINISH);
1601
    if (ret != Z_STREAM_END && ret != Z_OK) {
1602
        qemu_free(out_buf);
1603
        deflateEnd(&strm);
1604
        return -1;
1605
    }
1606
    out_len = strm.next_out - out_buf;
1607

    
1608
    deflateEnd(&strm);
1609

    
1610
    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1611
        /* could not compress: write normal cluster */
1612
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
1613
    } else {
1614
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
1615
                                              out_len);
1616
        if (!cluster_offset)
1617
            return -1;
1618
        cluster_offset &= s->cluster_offset_mask;
1619
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
1620
            qemu_free(out_buf);
1621
            return -1;
1622
        }
1623
    }
1624

    
1625
    qemu_free(out_buf);
1626
    return 0;
1627
}
1628

    
1629
/* Flush by forwarding the request to the underlying image file. */
static void qcow_flush(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;

    bdrv_flush(state->hd);
}
1634

    
1635
/*
 * Fill bdi with the cluster size and the offset of the VM state area,
 * i.e. the disk offset mapped by L1 index s->l1_vm_state_index.
 * Always returns 0.
 */
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *state = bs->opaque;
    int l1_entry_bits = state->cluster_bits + state->l2_bits;

    bdi->cluster_size = state->cluster_size;
    bdi->vm_state_offset = (int64_t)state->l1_vm_state_index << l1_entry_bits;
    return 0;
}
1643

    
1644
/*********************************************************/
1645
/* snapshot support */
1646

    
1647
/* update the refcounts of snapshots and the copied flag */
1648
static int update_snapshot_refcount(BlockDriverState *bs,
1649
                                    int64_t l1_table_offset,
1650
                                    int l1_size,
1651
                                    int addend)
1652
{
1653
    BDRVQcowState *s = bs->opaque;
1654
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
1655
    int64_t old_offset, old_l2_offset;
1656
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
1657

    
1658
    l2_cache_reset(bs);
1659

    
1660
    l2_table = NULL;
1661
    l1_table = NULL;
1662
    l1_size2 = l1_size * sizeof(uint64_t);
1663
    l1_allocated = 0;
1664
    if (l1_table_offset != s->l1_table_offset) {
1665
        l1_table = qemu_malloc(l1_size2);
1666
        l1_allocated = 1;
1667
        if (bdrv_pread(s->hd, l1_table_offset,
1668
                       l1_table, l1_size2) != l1_size2)
1669
            goto fail;
1670
        for(i = 0;i < l1_size; i++)
1671
            be64_to_cpus(&l1_table[i]);
1672
    } else {
1673
        assert(l1_size == s->l1_size);
1674
        l1_table = s->l1_table;
1675
        l1_allocated = 0;
1676
    }
1677

    
1678
    l2_size = s->l2_size * sizeof(uint64_t);
1679
    l2_table = qemu_malloc(l2_size);
1680
    l1_modified = 0;
1681
    for(i = 0; i < l1_size; i++) {
1682
        l2_offset = l1_table[i];
1683
        if (l2_offset) {
1684
            old_l2_offset = l2_offset;
1685
            l2_offset &= ~QCOW_OFLAG_COPIED;
1686
            l2_modified = 0;
1687
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
1688
                goto fail;
1689
            for(j = 0; j < s->l2_size; j++) {
1690
                offset = be64_to_cpu(l2_table[j]);
1691
                if (offset != 0) {
1692
                    old_offset = offset;
1693
                    offset &= ~QCOW_OFLAG_COPIED;
1694
                    if (offset & QCOW_OFLAG_COMPRESSED) {
1695
                        nb_csectors = ((offset >> s->csize_shift) &
1696
                                       s->csize_mask) + 1;
1697
                        if (addend != 0)
1698
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
1699
                                            nb_csectors * 512, addend);
1700
                        /* compressed clusters are never modified */
1701
                        refcount = 2;
1702
                    } else {
1703
                        if (addend != 0) {
1704
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
1705
                        } else {
1706
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
1707
                        }
1708
                    }
1709

    
1710
                    if (refcount == 1) {
1711
                        offset |= QCOW_OFLAG_COPIED;
1712
                    }
1713
                    if (offset != old_offset) {
1714
                        l2_table[j] = cpu_to_be64(offset);
1715
                        l2_modified = 1;
1716
                    }
1717
                }
1718
            }
1719
            if (l2_modified) {
1720
                if (bdrv_pwrite(s->hd,
1721
                                l2_offset, l2_table, l2_size) != l2_size)
1722
                    goto fail;
1723
            }
1724

    
1725
            if (addend != 0) {
1726
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
1727
            } else {
1728
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1729
            }
1730
            if (refcount == 1) {
1731
                l2_offset |= QCOW_OFLAG_COPIED;
1732
            }
1733
            if (l2_offset != old_l2_offset) {
1734
                l1_table[i] = l2_offset;
1735
                l1_modified = 1;
1736
            }
1737
        }
1738
    }
1739
    if (l1_modified) {
1740
        for(i = 0; i < l1_size; i++)
1741
            cpu_to_be64s(&l1_table[i]);
1742
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
1743
                        l1_size2) != l1_size2)
1744
            goto fail;
1745
        for(i = 0; i < l1_size; i++)
1746
            be64_to_cpus(&l1_table[i]);
1747
    }
1748
    if (l1_allocated)
1749
        qemu_free(l1_table);
1750
    qemu_free(l2_table);
1751
    return 0;
1752
 fail:
1753
    if (l1_allocated)
1754
        qemu_free(l1_table);
1755
    qemu_free(l2_table);
1756
    return -EIO;
1757
}
1758

    
1759
/*
 * Free the in-memory snapshot list (the name and id string of every entry,
 * then the array itself) and reset it to empty.
 */
static void qcow_free_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;
    int idx = 0;

    while (idx < state->nb_snapshots) {
        QCowSnapshot *entry = &state->snapshots[idx];

        qemu_free(entry->name);
        qemu_free(entry->id_str);
        idx++;
    }

    qemu_free(state->snapshots);
    state->snapshots = NULL;
    state->nb_snapshots = 0;
}
1772

    
1773
/*
 * Load the snapshot table from the image file into s->snapshots.
 *
 * Each on-disk record starts on an 8-byte boundary and consists of a
 * QCowSnapshotHeader, extra_data_size bytes that are skipped, the id
 * string and the name. Both strings are stored NUL-terminated in memory.
 * s->snapshots_size receives the size in bytes of the table read.
 *
 * Returns 0 on success. On a read failure everything loaded so far is
 * freed with qcow_free_snapshots() and -1 is returned.
 */
static int qcow_read_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshotHeader h;
    int64_t offset;
    int i;

    if (s->nb_snapshots == 0) {
        s->snapshots = NULL;
        s->snapshots_size = 0;
        return 0;
    }

    offset = s->snapshots_offset;
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
    for (i = 0; i < s->nb_snapshots; i++) {
        QCowSnapshot *sn = &s->snapshots[i];
        uint32_t extra_data_size;
        int id_str_size, name_size;

        /* fixed-size header */
        offset = align_offset(offset, 8);
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);

        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
        sn->l1_size = be32_to_cpu(h.l1_size);
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
        sn->date_sec = be32_to_cpu(h.date_sec);
        sn->date_nsec = be32_to_cpu(h.date_nsec);
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);

        extra_data_size = be32_to_cpu(h.extra_data_size);
        id_str_size = be16_to_cpu(h.id_str_size);
        name_size = be16_to_cpu(h.name_size);

        /* the extra data is not interpreted, only skipped */
        offset += extra_data_size;

        /* id string */
        sn->id_str = qemu_malloc(id_str_size + 1);
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        sn->id_str[id_str_size] = '\0';
        offset += id_str_size;

        /* name */
        sn->name = qemu_malloc(name_size + 1);
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        sn->name[name_size] = '\0';
        offset += name_size;
    }
    s->snapshots_size = offset - s->snapshots_offset;
    return 0;

 fail:
    qcow_free_snapshots(bs);
    return -1;
}
1827

    
1828
/* add at the end of the file a new list of snapshots */
1829
static int qcow_write_snapshots(BlockDriverState *bs)
1830
{
1831
    BDRVQcowState *s = bs->opaque;
1832
    QCowSnapshot *sn;
1833
    QCowSnapshotHeader h;
1834
    int i, name_size, id_str_size, snapshots_size;
1835
    uint64_t data64;
1836
    uint32_t data32;
1837
    int64_t offset, snapshots_offset;
1838

    
1839
    /* compute the size of the snapshots */
1840
    offset = 0;
1841
    for(i = 0; i < s->nb_snapshots; i++) {
1842
        sn = s->snapshots + i;
1843
        offset = align_offset(offset, 8);
1844
        offset += sizeof(h);
1845
        offset += strlen(sn->id_str);
1846
        offset += strlen(sn->name);
1847
    }
1848
    snapshots_size = offset;
1849

    
1850
    snapshots_offset = alloc_clusters(bs, snapshots_size);
1851
    offset = snapshots_offset;
1852

    
1853
    for(i = 0; i < s->nb_snapshots; i++) {
1854
        sn = s->snapshots + i;
1855
        memset(&h, 0, sizeof(h));
1856
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
1857
        h.l1_size = cpu_to_be32(sn->l1_size);
1858
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
1859
        h.date_sec = cpu_to_be32(sn->date_sec);
1860
        h.date_nsec = cpu_to_be32(sn->date_nsec);
1861
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
1862

    
1863
        id_str_size = strlen(sn->id_str);
1864
        name_size = strlen(sn->name);
1865
        h.id_str_size = cpu_to_be16(id_str_size);
1866
        h.name_size = cpu_to_be16(name_size);
1867
        offset = align_offset(offset, 8);
1868
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1869
            goto fail;
1870
        offset += sizeof(h);
1871
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1872
            goto fail;
1873
        offset += id_str_size;
1874
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
1875
            goto fail;
1876
        offset += name_size;
1877
    }
1878

    
1879
    /* update the various header fields */
1880
    data64 = cpu_to_be64(snapshots_offset);
1881
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
1882
                    &data64, sizeof(data64)) != sizeof(data64))
1883
        goto fail;
1884
    data32 = cpu_to_be32(s->nb_snapshots);
1885
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
1886
                    &data32, sizeof(data32)) != sizeof(data32))
1887
        goto fail;
1888

    
1889
    /* free the old snapshot table */
1890
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
1891
    s->snapshots_offset = snapshots_offset;
1892
    s->snapshots_size = snapshots_size;
1893
    return 0;
1894
 fail:
1895
    return -1;
1896
}
1897

    
1898
/*
 * Produce a new snapshot id: the largest id found in the current snapshot
 * list (each id string is parsed as a decimal number) plus one, formatted
 * into id_str, which holds id_str_size bytes.
 */
static void find_new_snapshot_id(BlockDriverState *bs,
                                 char *id_str, int id_str_size)
{
    BDRVQcowState *s = bs->opaque;
    int highest = 0;
    int idx;

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        int value = strtoul(s->snapshots[idx].id_str, NULL, 10);

        if (highest < value)
            highest = value;
    }
    snprintf(id_str, id_str_size, "%d", highest + 1);
}
1913

    
1914
/*
 * Look up a snapshot by exact id string.
 * Returns its index in s->snapshots, or -1 when no snapshot has this id.
 */
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
{
    BDRVQcowState *s = bs->opaque;
    int idx = 0;

    while (idx < s->nb_snapshots) {
        if (strcmp(s->snapshots[idx].id_str, id_str) == 0)
            return idx;
        idx++;
    }
    return -1;
}
1925

    
1926
/*
 * Look up a snapshot, treating 'name' first as an id and, if no id
 * matches, as a snapshot name.
 * Returns the index in s->snapshots, or -1 when nothing matches.
 */
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
{
    BDRVQcowState *s = bs->opaque;
    int by_id, idx;

    /* an id match takes precedence over a name match */
    by_id = find_snapshot_by_id(bs, name);
    if (by_id >= 0)
        return by_id;

    idx = 0;
    while (idx < s->nb_snapshots) {
        if (strcmp(s->snapshots[idx].name, name) == 0)
            return idx;
        idx++;
    }
    return -1;
}
1940

    
1941
/* if no id is provided, a new one is constructed */
1942
static int qcow_snapshot_create(BlockDriverState *bs,
1943
                                QEMUSnapshotInfo *sn_info)
1944
{
1945
    BDRVQcowState *s = bs->opaque;
1946
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
1947
    int i, ret;
1948
    uint64_t *l1_table = NULL;
1949

    
1950
    memset(sn, 0, sizeof(*sn));
1951

    
1952
    if (sn_info->id_str[0] == '\0') {
1953
        /* compute a new id */
1954
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
1955
    }
1956

    
1957
    /* check that the ID is unique */
1958
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
1959
        return -ENOENT;
1960

    
1961
    sn->id_str = qemu_strdup(sn_info->id_str);
1962
    if (!sn->id_str)
1963
        goto fail;
1964
    sn->name = qemu_strdup(sn_info->name);
1965
    if (!sn->name)
1966
        goto fail;
1967
    sn->vm_state_size = sn_info->vm_state_size;
1968
    sn->date_sec = sn_info->date_sec;
1969
    sn->date_nsec = sn_info->date_nsec;
1970
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
1971

    
1972
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
1973
    if (ret < 0)
1974
        goto fail;
1975

    
1976
    /* create the L1 table of the snapshot */
1977
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
1978
    sn->l1_size = s->l1_size;
1979

    
1980
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
1981
    for(i = 0; i < s->l1_size; i++) {
1982
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
1983
    }
1984
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
1985
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
1986
        (s->l1_size * sizeof(uint64_t)))
1987
        goto fail;
1988
    qemu_free(l1_table);
1989
    l1_table = NULL;
1990

    
1991
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
1992
    if (s->snapshots) {
1993
        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
1994
        qemu_free(s->snapshots);
1995
    }
1996
    s->snapshots = snapshots1;
1997
    s->snapshots[s->nb_snapshots++] = *sn;
1998

    
1999
    if (qcow_write_snapshots(bs) < 0)
2000
        goto fail;
2001
#ifdef DEBUG_ALLOC
2002
    check_refcounts(bs);
2003
#endif
2004
    return 0;
2005
 fail:
2006
    qemu_free(sn->name);
2007
    qemu_free(l1_table);
2008
    return -1;
2009
}
2010

    
2011
/* copy the snapshot 'snapshot_name' into the current disk image */
2012
static int qcow_snapshot_goto(BlockDriverState *bs,
2013
                              const char *snapshot_id)
2014
{
2015
    BDRVQcowState *s = bs->opaque;
2016
    QCowSnapshot *sn;
2017
    int i, snapshot_index, l1_size2;
2018

    
2019
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
2020
    if (snapshot_index < 0)
2021
        return -ENOENT;
2022
    sn = &s->snapshots[snapshot_index];
2023

    
2024
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
2025
        goto fail;
2026

    
2027
    if (grow_l1_table(bs, sn->l1_size) < 0)
2028
        goto fail;
2029

    
2030
    s->l1_size = sn->l1_size;
2031
    l1_size2 = s->l1_size * sizeof(uint64_t);
2032
    /* copy the snapshot l1 table to the current l1 table */
2033
    if (bdrv_pread(s->hd, sn->l1_table_offset,
2034
                   s->l1_table, l1_size2) != l1_size2)
2035
        goto fail;
2036
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
2037
                    s->l1_table, l1_size2) != l1_size2)
2038
        goto fail;
2039
    for(i = 0;i < s->l1_size; i++) {
2040
        be64_to_cpus(&s->l1_table[i]);
2041
    }
2042

    
2043
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
2044
        goto fail;
2045

    
2046
#ifdef DEBUG_ALLOC
2047
    check_refcounts(bs);
2048
#endif
2049
    return 0;
2050
 fail:
2051
    return -EIO;
2052
}
2053

    
2054
/*
 * Delete the snapshot identified by 'snapshot_id' (an id or a name).
 *
 * The references held through the snapshot's L1 table are dropped, the
 * L1 table clusters are freed, the entry is removed from the in-memory
 * list and the list is rewritten to disk.
 *
 * Returns 0 on success, -ENOENT if no such snapshot exists, or the
 * negative value returned by the failing helper.
 */
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *victim;
    int idx, rc;

    idx = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (idx < 0)
        return -ENOENT;
    victim = s->snapshots + idx;

    rc = update_snapshot_refcount(bs, victim->l1_table_offset,
                                  victim->l1_size, -1);
    if (rc < 0)
        return rc;

    /* an addend of 0 only refreshes the copied flag on the current
       cluster offsets */
    rc = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
    if (rc < 0)
        return rc;

    free_clusters(bs, victim->l1_table_offset,
                  victim->l1_size * sizeof(uint64_t));

    /* release the entry and close the gap it leaves in the array */
    qemu_free(victim->id_str);
    qemu_free(victim->name);
    memmove(victim, victim + 1,
            (s->nb_snapshots - idx - 1) * sizeof(*victim));
    s->nb_snapshots--;

    rc = qcow_write_snapshots(bs);
    if (rc < 0) {
        /* XXX: restore snapshot if error ? */
        return rc;
    }
#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
}
2088

    
2089
static int qcow_snapshot_list(BlockDriverState *bs,
2090
                              QEMUSnapshotInfo **psn_tab)
2091
{
2092
    BDRVQcowState *s = bs->opaque;
2093
    QEMUSnapshotInfo *sn_tab, *sn_info;
2094
    QCowSnapshot *sn;
2095
    int i;
2096

    
2097
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2098
    for(i = 0; i < s->nb_snapshots; i++) {
2099
        sn_info = sn_tab + i;
2100
        sn = s->snapshots + i;
2101
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2102
                sn->id_str);
2103
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2104
                sn->name);
2105
        sn_info->vm_state_size = sn->vm_state_size;
2106
        sn_info->date_sec = sn->date_sec;
2107
        sn_info->date_nsec = sn->date_nsec;
2108
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2109
    }
2110
    *psn_tab = sn_tab;
2111
    return s->nb_snapshots;
2112
}
2113

    
2114
/*********************************************************/
2115
/* refcount handling */
2116

    
2117
/*
 * Allocate the one-cluster refcount block cache and load the refcount
 * table from s->refcount_table_offset, converting its entries from
 * big-endian to host byte order.
 *
 * Returns 0 on success and -ENOMEM if the table cannot be read completely.
 */
static int refcount_init(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int nread, table_bytes, idx;

    s->refcount_block_cache = qemu_malloc(s->cluster_size);
    table_bytes = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = qemu_malloc(table_bytes);

    /* an empty table needs no I/O */
    if (s->refcount_table_size > 0) {
        nread = bdrv_pread(s->hd, s->refcount_table_offset,
                           s->refcount_table, table_bytes);
        if (nread != table_bytes)
            return -ENOMEM;
        for (idx = 0; idx < s->refcount_table_size; idx++) {
            be64_to_cpus(&s->refcount_table[idx]);
        }
    }
    return 0;
}
2137

    
2138
/* Release the buffers allocated by refcount_init(). */
static void refcount_close(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;

    qemu_free(state->refcount_block_cache);
    qemu_free(state->refcount_table);
}
2144

    
2145

    
2146
/*
 * Read the refcount block stored at 'refcount_block_offset' into the
 * single-block cache and remember which block the cache now holds.
 *
 * Returns 0 on success and -EIO when a full cluster could not be read;
 * in that case the recorded cache offset is left unchanged.
 */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset)
{
    BDRVQcowState *s = bs->opaque;
    int nread;

    nread = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
                       s->cluster_size);
    if (nread == s->cluster_size) {
        s->refcount_block_cache_offset = refcount_block_offset;
        return 0;
    }
    return -EIO;
}
2158

    
2159
/*
 * Return the reference count of cluster 'cluster_index'.
 *
 * Clusters that lie beyond the refcount table, or whose refcount block
 * has not been allocated, count as free (0).  If the refcount block
 * cannot be read, 1 is returned so that the cluster is treated as
 * allocated.
 */
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *s = bs->opaque;
    int table_idx, slot;
    int64_t block_offset;

    table_idx = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (table_idx >= s->refcount_table_size)
        return 0;

    block_offset = s->refcount_table[table_idx];
    if (block_offset == 0)
        return 0;

    /* better than nothing: return allocated if read error */
    if (block_offset != s->refcount_block_cache_offset &&
        load_refcount_block(bs, block_offset) < 0)
        return 1;

    /* position of this cluster's entry inside its refcount block */
    slot = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    return be16_to_cpu(s->refcount_block_cache[slot]);
}
2180

    
2181
/* return < 0 if error */
2182
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2183
{
2184
    BDRVQcowState *s = bs->opaque;
2185
    int i, nb_clusters;
2186

    
2187
    nb_clusters = size_to_clusters(s, size);
2188
retry:
2189
    for(i = 0; i < nb_clusters; i++) {
2190
        int64_t i = s->free_cluster_index++;
2191
        if (get_refcount(bs, i) != 0)
2192
            goto retry;
2193
    }
2194
#ifdef DEBUG_ALLOC2
2195
    printf("alloc_clusters: size=%lld -> %lld\n",
2196
            size,
2197
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2198
#endif
2199
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2200
}
2201

    
2202
/*
 * Allocate enough contiguous clusters for 'size' bytes, take one
 * reference on each of them and return the byte offset of the first one.
 */
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
{
    int64_t first = alloc_clusters_noref(bs, size);

    update_refcount(bs, first, size, 1);
    return first;
}
2210

    
2211
/* only used to allocate compressed sectors. We try to allocate
2212
   contiguous sectors. size must be <= cluster_size */
2213
static int64_t alloc_bytes(BlockDriverState *bs, int size)
2214
{
2215
    BDRVQcowState *s = bs->opaque;
2216
    int64_t offset, cluster_offset;
2217
    int free_in_cluster;
2218

    
2219
    assert(size > 0 && size <= s->cluster_size);
2220
    if (s->free_byte_offset == 0) {
2221
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
2222
    }
2223
 redo:
2224
    free_in_cluster = s->cluster_size -
2225
        (s->free_byte_offset & (s->cluster_size - 1));
2226
    if (size <= free_in_cluster) {
2227
        /* enough space in current cluster */
2228
        offset = s->free_byte_offset;
2229
        s->free_byte_offset += size;
2230
        free_in_cluster -= size;
2231
        if (free_in_cluster == 0)
2232
            s->free_byte_offset = 0;
2233
        if ((offset & (s->cluster_size - 1)) != 0)
2234
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2235
    } else {
2236
        offset = alloc_clusters(bs, s->cluster_size);
2237
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
2238
        if ((cluster_offset + s->cluster_size) == offset) {
2239
            /* we are lucky: contiguous data */
2240
            offset = s->free_byte_offset;
2241
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2242
            s->free_byte_offset += size;
2243
        } else {
2244
            s->free_byte_offset = offset;
2245
            goto redo;
2246
        }
2247
    }
2248
    return offset;
2249
}
2250

    
2251
/*
 * Drop one reference from every cluster touched by the byte range
 * [offset, offset + size).
 */
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size)
{
    const int drop_one_ref = -1;

    update_refcount(bs, offset, size, drop_one_ref);
}
2256

    
2257
/*
 * Enlarge the refcount table so that it holds at least 'min_size'
 * entries.
 *
 * The new size is found by growing the table's cluster count by about
 * 1.5x per step (starting at one cluster) until it is large enough.  The
 * enlarged table is written to newly chosen clusters, the header fields
 * at offsetof(QCowHeader, refcount_table_offset) are updated with the new
 * offset and cluster count, and only then are the refcounts of the new
 * and the old table adjusted.
 *
 * Returns 0 on success (also when no growth is needed) and -EIO on
 * failure.
 */
static int grow_refcount_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_entries, new_bytes, table_clusters, idx, written;
    uint64_t *grown;
    int64_t new_offset;
    uint8_t hdr_fields[12];     /* 64 bit offset followed by 32 bit count */
    int prev_entries;
    int64_t prev_offset;

    if (min_size <= s->refcount_table_size)
        return 0;

    /* compute new table size */
    table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
    do {
        if (table_clusters == 0)
            table_clusters = 1;
        else
            table_clusters = (table_clusters * 3 + 1) / 2;
        new_entries = table_clusters << (s->cluster_bits - 3);
    } while (min_size > new_entries);
#ifdef DEBUG_ALLOC2
    printf("grow_refcount_table from %d to %d\n",
           s->refcount_table_size,
           new_entries);
#endif
    new_bytes = new_entries * sizeof(uint64_t);
    grown = qemu_mallocz(new_bytes);
    memcpy(grown, s->refcount_table,
           s->refcount_table_size * sizeof(uint64_t));

    /* the table is stored big-endian on disk: convert, write, convert back */
    for (idx = 0; idx < s->refcount_table_size; idx++)
        cpu_to_be64s(&grown[idx]);
    /* Note: we cannot update the refcount now to avoid recursion */
    new_offset = alloc_clusters_noref(bs, new_bytes);
    written = bdrv_pwrite(s->hd, new_offset, grown, new_bytes);
    if (written != new_bytes)
        goto fail;
    for (idx = 0; idx < s->refcount_table_size; idx++)
        be64_to_cpus(&grown[idx]);

    /* point the header at the new table */
    cpu_to_be64w((uint64_t*)hdr_fields, new_offset);
    cpu_to_be32w((uint32_t*)(hdr_fields + 8), table_clusters);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
                    hdr_fields, sizeof(hdr_fields)) != sizeof(hdr_fields))
        goto fail;

    /* switch the in-memory state over to the new table */
    qemu_free(s->refcount_table);
    prev_offset = s->refcount_table_offset;
    prev_entries = s->refcount_table_size;
    s->refcount_table = grown;
    s->refcount_table_size = new_entries;
    s->refcount_table_offset = new_offset;

    /* now the refcounts can be updated safely */
    update_refcount(bs, new_offset, new_bytes, 1);
    free_clusters(bs, prev_offset, prev_entries * sizeof(uint64_t));
    return 0;
 fail:
    /* NOTE(review): the clusters were obtained with alloc_clusters_noref(),
       so no reference was taken before this release - confirm intent */
    free_clusters(bs, new_offset, new_bytes);
    qemu_free(grown);
    return -EIO;
}
2320

    
2321
/* addend must be 1 or -1 */
2322
/* XXX: cache several refcount block clusters ? */
2323
static int update_cluster_refcount(BlockDriverState *bs,
2324
                                   int64_t cluster_index,
2325
                                   int addend)
2326
{
2327
    BDRVQcowState *s = bs->opaque;
2328
    int64_t offset, refcount_block_offset;
2329
    int ret, refcount_table_index, block_index, refcount;
2330
    uint64_t data64;
2331

    
2332
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2333
    if (refcount_table_index >= s->refcount_table_size) {
2334
        if (addend < 0)
2335
            return -EINVAL;
2336
        ret = grow_refcount_table(bs, refcount_table_index + 1);
2337
        if (ret < 0)
2338
            return ret;
2339
    }
2340
    refcount_block_offset = s->refcount_table[refcount_table_index];
2341
    if (!refcount_block_offset) {
2342
        if (addend < 0)
2343
            return -EINVAL;
2344
        /* create a new refcount block */
2345
        /* Note: we cannot update the refcount now to avoid recursion */
2346
        offset = alloc_clusters_noref(bs, s->cluster_size);
2347
        memset(s->refcount_block_cache, 0, s->cluster_size);
2348
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
2349
        if (ret != s->cluster_size)
2350
            return -EINVAL;
2351
        s->refcount_table[refcount_table_index] = offset;
2352
        data64 = cpu_to_be64(offset);
2353
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
2354
                          refcount_table_index * sizeof(uint64_t),
2355
                          &data64, sizeof(data64));
2356
        if (ret != sizeof(data64))
2357
            return -EINVAL;
2358

    
2359
        refcount_block_offset = offset;
2360
        s->refcount_block_cache_offset = offset;
2361
        update_refcount(bs, offset, s->cluster_size, 1);
2362
    } else {
2363
        if (refcount_block_offset != s->refcount_block_cache_offset) {
2364
            if (load_refcount_block(bs, refcount_block_offset) < 0)
2365
                return -EIO;
2366
        }
2367
    }
2368
    /* we can update the count and save it */
2369
    block_index = cluster_index &
2370
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2371
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
2372
    refcount += addend;
2373
    if (refcount < 0 || refcount > 0xffff)
2374
        return -EINVAL;
2375
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
2376
        s->free_cluster_index = cluster_index;
2377
    }
2378
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
2379
    if (bdrv_pwrite(s->hd,
2380
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
2381
                    &s->refcount_block_cache[block_index], 2) != 2)
2382
        return -EIO;
2383
    return refcount;
2384
}
2385

    
2386
/*
 * Apply 'addend' to the refcount of every cluster overlapped by the byte
 * range [offset, offset + length).  Nothing is done when length <= 0.
 * The results of the individual cluster updates are not checked.
 */
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t first, final, pos;

#ifdef DEBUG_ALLOC2
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
           offset, length, addend);
#endif
    if (length <= 0)
        return;

    /* round both ends down to the start of their cluster */
    first = offset & ~(s->cluster_size - 1);
    final = (offset + length - 1) & ~(s->cluster_size - 1);

    pos = first;
    while (pos <= final) {
        update_cluster_refcount(bs, pos >> s->cluster_bits, addend);
        pos += s->cluster_size;
    }
}
2406

    
2407
#ifdef DEBUG_ALLOC
2408
/*
 * Consistency-check helper: increment, in the caller's reference array
 * 'refcount_table' (one 16 bit counter per cluster, refcount_table_size
 * entries), the counter of every cluster overlapped by the byte range
 * [offset, offset + size).  Out-of-range clusters and counter overflows
 * are reported with printf().
 */
static void inc_refcounts(BlockDriverState *bs,
                          uint16_t *refcount_table,
                          int refcount_table_size,
                          int64_t offset, int64_t size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t first, final, pos;
    int cluster;

    if (size <= 0)
        return;

    first = offset & ~(s->cluster_size - 1);
    final = (offset + size - 1) & ~(s->cluster_size - 1);
    pos = first;
    while (pos <= final) {
        cluster = pos >> s->cluster_bits;
        if (cluster < 0 || cluster >= refcount_table_size) {
            printf("ERROR: invalid cluster offset=0x%llx\n", pos);
        } else if (++refcount_table[cluster] == 0) {
            /* the 16 bit counter wrapped around */
            printf("ERROR: overflow cluster offset=0x%llx\n", pos);
        }
        pos += s->cluster_size;
    }
}
2434

    
2435
/*
 * Consistency-check helper: walk the L1 table stored at 'l1_table_offset'
 * ('l1_size' entries) and every L2 table it references, counting each
 * referenced cluster in the caller's 'refcount_table' via inc_refcounts().
 *
 * With 'check_copied' set, each L2 table offset and each uncompressed
 * cluster offset is also checked: QCOW_OFLAG_COPIED must be set exactly
 * when the on-disk refcount is 1.  Mismatches are reported with printf().
 *
 * Returns 0 on success and -EIO if a table cannot be read.
 */
static int check_refcounts_l1(BlockDriverState *bs,
                              uint16_t *refcount_table,
                              int refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int check_copied)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_copy, *l2_copy, l2_pos, entry, l1_bytes;
    int l2_bytes, a, b, nb_csectors, refs;

    l2_copy = NULL;
    l1_bytes = l1_size * sizeof(uint64_t);

    /* the L1 table itself occupies clusters */
    inc_refcounts(bs, refcount_table, refcount_table_size,
                  l1_table_offset, l1_bytes);

    l1_copy = qemu_malloc(l1_bytes);
    if (bdrv_pread(s->hd, l1_table_offset,
                   l1_copy, l1_bytes) != l1_bytes)
        goto fail;
    for (a = 0; a < l1_size; a++)
        be64_to_cpus(&l1_copy[a]);

    l2_bytes = s->l2_size * sizeof(uint64_t);
    l2_copy = qemu_malloc(l2_bytes);
    for (a = 0; a < l1_size; a++) {
        l2_pos = l1_copy[a];
        if (!l2_pos)
            continue;           /* no L2 table for this entry */

        if (check_copied) {
            refs = get_refcount(bs, (l2_pos & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
            if ((refs == 1) != ((l2_pos & QCOW_OFLAG_COPIED) != 0)) {
                printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
                       l2_pos, refs);
            }
        }
        l2_pos &= ~QCOW_OFLAG_COPIED;
        if (bdrv_pread(s->hd, l2_pos, l2_copy, l2_bytes) != l2_bytes)
            goto fail;

        for (b = 0; b < s->l2_size; b++) {
            entry = be64_to_cpu(l2_copy[b]);
            if (entry == 0)
                continue;       /* unallocated cluster */

            if (entry & QCOW_OFLAG_COMPRESSED) {
                if (entry & QCOW_OFLAG_COPIED) {
                    printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
                           entry >> s->cluster_bits);
                    entry &= ~QCOW_OFLAG_COPIED;
                }
                /* compressed data is counted in 512 byte sectors */
                nb_csectors = ((entry >> s->csize_shift) &
                               s->csize_mask) + 1;
                entry &= s->cluster_offset_mask;
                inc_refcounts(bs, refcount_table,
                              refcount_table_size,
                              entry & ~511, nb_csectors * 512);
            } else {
                if (check_copied) {
                    refs = get_refcount(bs, (entry & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                    if ((refs == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
                        printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
                               entry, refs);
                    }
                }
                entry &= ~QCOW_OFLAG_COPIED;
                inc_refcounts(bs, refcount_table,
                              refcount_table_size,
                              entry, s->cluster_size);
            }
        }

        /* finally count the cluster holding the L2 table itself */
        inc_refcounts(bs, refcount_table,
                      refcount_table_size,
                      l2_pos,
                      s->cluster_size);
    }
    qemu_free(l1_copy);
    qemu_free(l2_copy);
    return 0;
 fail:
    printf("ERROR: I/O error in check_refcounts_l1\n");
    qemu_free(l1_copy);
    qemu_free(l2_copy);
    return -EIO;
}
2518

    
2519
/*
 * Debug check of the refcount data: recompute how often every cluster of
 * the image file is referenced (header, active L1/L2 tables, snapshot
 * L1/L2 tables, snapshot list, refcount table and refcount blocks) and
 * print an error for each cluster whose stored refcount differs.
 */
static void check_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t file_len;
    int nb_clusters, stored, expected, idx;
    QCowSnapshot *snap;
    uint16_t *expected_counts;

    file_len = bdrv_getlength(s->hd);
    nb_clusters = size_to_clusters(s, file_len);
    expected_counts = qemu_mallocz(nb_clusters * sizeof(uint16_t));

    /* header */
    inc_refcounts(bs, expected_counts, nb_clusters,
                  0, s->cluster_size);

    /* active tables: the copied flag is checked here only */
    check_refcounts_l1(bs, expected_counts, nb_clusters,
                       s->l1_table_offset, s->l1_size, 1);

    /* snapshots */
    for (idx = 0; idx < s->nb_snapshots; idx++) {
        snap = &s->snapshots[idx];
        check_refcounts_l1(bs, expected_counts, nb_clusters,
                           snap->l1_table_offset, snap->l1_size, 0);
    }
    inc_refcounts(bs, expected_counts, nb_clusters,
                  s->snapshots_offset, s->snapshots_size);

    /* refcount data */
    inc_refcounts(bs, expected_counts, nb_clusters,
                  s->refcount_table_offset,
                  s->refcount_table_size * sizeof(uint64_t));
    for (idx = 0; idx < s->refcount_table_size; idx++) {
        int64_t block_offset = s->refcount_table[idx];

        if (block_offset != 0) {
            inc_refcounts(bs, expected_counts, nb_clusters,
                          block_offset, s->cluster_size);
        }
    }

    /* compare ref counts */
    for (idx = 0; idx < nb_clusters; idx++) {
        stored = get_refcount(bs, idx);
        expected = expected_counts[idx];
        if (stored != expected)
            printf("ERROR cluster %d refcount=%d reference=%d\n",
                   idx, stored, expected);
    }

    qemu_free(expected_counts);
}
2571

    
2572
#if 0
2573
/*
 * Debug helper (currently compiled out): print the refcounts of the
 * image file's clusters, one line per run of consecutive clusters that
 * share the same refcount.  The index printed first is the one reached
 * after the run.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t total, cur, run_start, file_len;
    int count;

    file_len = bdrv_getlength(s->hd);
    total = size_to_clusters(s, file_len);
    cur = 0;
    while (cur < total) {
        run_start = cur;
        count = get_refcount(bs, cur);
        /* extend the run while the refcount stays the same */
        do {
            cur++;
        } while (cur < total && get_refcount(bs, cur) == count);
        printf("%lld: refcount=%d nb=%lld\n", cur, count, cur - run_start);
    }
}
2590
#endif
2591
#endif
2592

    
2593
BlockDriver bdrv_qcow2 = {
2594
    .format_name        = "qcow2",
2595
    .instance_size        = sizeof(BDRVQcowState),
2596
    .bdrv_probe                = qcow_probe,
2597
    .bdrv_open                = qcow_open,
2598
    .bdrv_close                = qcow_close,
2599
    .bdrv_create        = qcow_create,
2600
    .bdrv_flush                = qcow_flush,
2601
    .bdrv_is_allocated        = qcow_is_allocated,
2602
    .bdrv_set_key        = qcow_set_key,
2603
    .bdrv_make_empty        = qcow_make_empty,
2604

    
2605
    .bdrv_aio_read        = qcow_aio_read,
2606
    .bdrv_aio_write        = qcow_aio_write,
2607
    .bdrv_aio_cancel        = qcow_aio_cancel,
2608
    .aiocb_size                = sizeof(QCowAIOCB),
2609
    .bdrv_write_compressed = qcow_write_compressed,
2610

    
2611
    .bdrv_snapshot_create = qcow_snapshot_create,
2612
    .bdrv_snapshot_goto        = qcow_snapshot_goto,
2613
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2614
    .bdrv_snapshot_list        = qcow_snapshot_list,
2615
    .bdrv_get_info        = qcow_get_info,
2616
};