Statistics
| Branch: | Revision:

root / block-qcow2.c @ 5fafdf24

History | View | Annotate | Download (71.5 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "vl.h"
25
#include "block_int.h"
26
#include <zlib.h>
27
#include "aes.h"
28
#include <assert.h>
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots. 
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables have always a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
//#define DEBUG_ALLOC2

/* "QFI\xfb" in big-endian byte order */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 2

#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

/* indicate that the refcount of the referenced cluster is exactly one. */
/* Use an unsigned constant: left-shifting 1LL into the sign bit is
   undefined behavior for signed integers. */
#define QCOW_OFLAG_COPIED     (1ULL << 63)
/* indicate that the cluster is compressed (they never have the copied flag) */
#define QCOW_OFLAG_COMPRESSED (1ULL << 62)

#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */

#ifndef offsetof
#define offsetof(type, field) ((size_t) &((type *)0)->field)
#endif
65

    
66
/* On-disk qcow2 file header.  All fields are stored big-endian and are
   byte-swapped in qcow_open() right after reading. */
typedef struct QCowHeader {
    uint32_t magic;               /* QCOW_MAGIC */
    uint32_t version;             /* QCOW_VERSION (2) */
    uint64_t backing_file_offset; /* byte offset of backing file name, 0 if none */
    uint32_t backing_file_size;   /* length of backing file name (no NUL) */
    uint32_t cluster_bits;        /* log2(cluster size); qcow_open accepts 9..16 */
    uint64_t size; /* in bytes */
    uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
    uint64_t l1_table_offset;     /* byte offset of the L1 table */
    uint64_t refcount_table_offset;
    uint32_t refcount_table_clusters; /* refcount table size in clusters */
    uint32_t nb_snapshots;
    uint64_t snapshots_offset;    /* byte offset of the snapshot table */
} QCowHeader;
81

    
82
/* On-disk layout of one snapshot table entry.  Packed because the
   structure is followed directly by variable-length data (extra data,
   id string, name).  Field endianness is presumably big-endian like
   the main header — confirm against qcow_read_snapshots(). */
typedef struct __attribute__((packed)) QCowSnapshotHeader {
    /* header is 8 byte aligned */
    uint64_t l1_table_offset;

    uint32_t l1_size;
    uint16_t id_str_size;   /* length of the id string that follows */
    uint16_t name_size;     /* length of the name that follows */

    uint32_t date_sec;
    uint32_t date_nsec;

    uint64_t vm_clock_nsec;

    uint32_t vm_state_size;
    uint32_t extra_data_size; /* for extension */
    /* extra data follows */
    /* id_str follows */
    /* name follows  */
} QCowSnapshotHeader;
101

    
102
#define L2_CACHE_SIZE 16   /* number of L2 tables kept in the cache */

/* In-memory description of one snapshot (the parsed form of
   QCowSnapshotHeader plus its trailing strings). */
typedef struct QCowSnapshot {
    uint64_t l1_table_offset;  /* L1 table of this snapshot */
    uint32_t l1_size;
    char *id_str;              /* snapshot id string */
    char *name;                /* snapshot display name */
    uint32_t vm_state_size;
    uint32_t date_sec;         /* timestamp, seconds part */
    uint32_t date_nsec;        /* timestamp, nanoseconds part */
    uint64_t vm_clock_nsec;
} QCowSnapshot;
114

    
115
/* In-memory state of an open qcow2 image.  Filled in by qcow_open()
   from the on-disk header; all offsets are byte offsets into the
   image file unless noted otherwise. */
typedef struct BDRVQcowState {
    BlockDriverState *hd;          /* underlying image file */
    int cluster_bits;              /* log2(cluster_size) */
    int cluster_size;              /* cluster size in bytes */
    int cluster_sectors;           /* cluster size in 512-byte sectors */
    int l2_bits;                   /* log2(entries per L2 table) */
    int l2_size;                   /* entries per L2 table (one cluster) */
    int l1_size;                   /* number of L1 entries */
    int l1_vm_state_index;         /* L1 entries needed to cover header.size bytes */
    int csize_shift;               /* shift extracting the compressed sector
                                      count from an L2 entry */
    int csize_mask;                /* mask for that sector count */
    uint64_t cluster_offset_mask;  /* mask for the compressed data offset */
    uint64_t l1_table_offset;
    uint64_t *l1_table;            /* L1 table, kept in CPU byte order */

    /* small fully-associative cache of L2 tables */
    uint64_t *l2_cache;                        /* table data */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];  /* tag: file offset per slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];   /* hit counters for eviction */

    uint8_t *cluster_cache;        /* last decompressed cluster */
    uint8_t *cluster_data;         /* bounce buffer (cluster + one sector) */
    uint64_t cluster_cache_offset; /* tag for cluster_cache, -1 when invalid */

    /* refcount handling */
    uint64_t *refcount_table;
    uint64_t refcount_table_offset;
    uint32_t refcount_table_size;
    uint64_t refcount_block_cache_offset;
    uint16_t *refcount_block_cache;
    int64_t free_cluster_index;
    int64_t free_byte_offset;

    /* encryption */
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header;
    AES_KEY aes_encrypt_key;
    AES_KEY aes_decrypt_key;

    /* snapshots */
    uint64_t snapshots_offset;
    int snapshots_size;
    int nb_snapshots;
    QCowSnapshot *snapshots;
} BDRVQcowState;
153

    
154
/* Forward declarations; the refcount helpers are defined later in
   this file. */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors);
static int qcow_read_snapshots(BlockDriverState *bs);
static void qcow_free_snapshots(BlockDriverState *bs);
static int refcount_init(BlockDriverState *bs);
static void refcount_close(BlockDriverState *bs);
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
static int update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
                                   int addend);
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend);
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
static int64_t alloc_bytes(BlockDriverState *bs, int size);
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size);
#ifdef DEBUG_ALLOC
static void check_refcounts(BlockDriverState *bs);
#endif
175

    
176
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
177
{
178
    const QCowHeader *cow_header = (const void *)buf;
179
   
180
    if (buf_size >= sizeof(QCowHeader) &&
181
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
182
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
183
        return 100;
184
    else
185
        return 0;
186
}
187

    
188
/* Open a qcow2 image: read and validate the header, load the L1
   table, set up the L2 cache, cluster buffers and refcount state, and
   read the backing file name and snapshot table.
   Returns 0 on success, a negative value on error. */
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift, ret;
    QCowHeader header;

    ret = bdrv_file_open(&s->hd, filename, flags);
    if (ret < 0)
        return ret;
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
        goto fail;
    /* all header fields are big-endian on disk */
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
        goto fail;
    /* cluster size restricted to 512 bytes .. 64 KB */
    if (header.size <= 1 ||
        header.cluster_bits < 9 ||
        header.cluster_bits > 16)
        goto fail;
    if (header.crypt_method > QCOW_CRYPT_AES)
        goto fail;
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header)
        bs->encrypted = 1;
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    /* layout of a compressed-cluster L2 entry: low csize_shift bits
       hold the byte offset, the next (cluster_bits - 8) bits hold the
       sector count - 1 */
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
    s->l1_size = header.l1_size;
    shift = s->cluster_bits + s->l2_bits;
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index)
        goto fail;
    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
    if (!s->l1_table)
        goto fail;
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
        s->l1_size * sizeof(uint64_t))
        goto fail;
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    if (!s->l2_cache)
        goto fail;
    s->cluster_cache = qemu_malloc(s->cluster_size);
    if (!s->cluster_cache)
        goto fail;
    /* one more sector for decompressed data alignment */
    s->cluster_data = qemu_malloc(s->cluster_size + 512);
    if (!s->cluster_data)
        goto fail;
    s->cluster_cache_offset = -1;   /* decompressed-cluster cache empty */

    if (refcount_init(bs) < 0)
        goto fail;

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023)   /* clamp to the bs->backing_file buffer */
            len = 1023;
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
            goto fail;
        bs->backing_file[len] = '\0';
    }
    if (qcow_read_snapshots(bs) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;

 fail:
    /* the cleanup helpers and qemu_free tolerate NULL/uninitialized-to-NULL
       members, so a single fail label covers every stage */
    qcow_free_snapshots(bs);
    refcount_close(bs);
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    bdrv_delete(s->hd);
    return -1;
}
301

    
302
/* Derive the AES encrypt/decrypt keys from a passphrase.  The
   passphrase bytes are used directly as a 128-bit key: zero-padded
   below 16 bytes, truncated above.  Returns 0 on success, -1 if the
   AES key schedule setup fails. */
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len, i;

    memset(keybuf, 0, 16);
    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for(i = 0;i < len;i++) {
        keybuf[i] = key[i];
    }
    /* only now does the image become usable for encrypted I/O */
    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
        return -1;
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;
#if 0
    /* test */
    {
        uint8_t in[16];
        uint8_t out[16];
        uint8_t tmp[16];
        for(i=0;i<16;i++)
            in[i] = i;
        AES_encrypt(in, tmp, &s->aes_encrypt_key);
        AES_decrypt(tmp, out, &s->aes_decrypt_key);
        for(i = 0; i < 16; i++)
            printf(" %02x", tmp[i]);
        printf("\n");
        for(i = 0; i < 16; i++)
            printf(" %02x", out[i]);
        printf("\n");
    }
#endif
    return 0;
}
343

    
344
/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                            uint8_t *out_buf, const uint8_t *in_buf,
                            int nb_sectors, int enc,
                            const AES_KEY *key)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;

    for(i = 0; i < nb_sectors; i++) {
        /* per-sector IV: little-endian sector number in the low 8
           bytes, high 8 bytes zero */
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        /* 'enc' selects encrypt (non-zero) vs decrypt (0);
           each 512-byte sector is an independent CBC chain */
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
                        ivec.b, enc);
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
}
368

    
369
/* Copy-on-write helper: read sectors [n_start, n_end) relative to
   'start_sect' through the normal qcow_read() path (so backing file,
   compression and decryption are honoured), re-encrypt if the image
   is encrypted, and write them raw into the new cluster at
   'cluster_offset'.  Returns 0 on success, negative on error. */
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                        uint64_t cluster_offset, int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int n, ret;

    n = n_end - n_start;
    if (n <= 0)
        return 0;
    /* bounce through s->cluster_data */
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
    if (ret < 0)
        return ret;
    if (s->crypt_method) {
        /* qcow_read returned plaintext; re-encrypt before writing */
        encrypt_sectors(s, start_sect + n_start,
                        s->cluster_data,
                        s->cluster_data, n, 1,
                        &s->aes_encrypt_key);
    }
    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
                     s->cluster_data, n);
    if (ret < 0)
        return ret;
    return 0;
}
393

    
394
/* Invalidate the whole L2 table cache: clear the cached table data,
   the offset tags and the usage counters. */
static void l2_cache_reset(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_offsets, 0, sizeof(s->l2_cache_offsets));
    memset(s->l2_cache_counts, 0, sizeof(s->l2_cache_counts));
}
402

    
403
/* Pick a cache slot for a new L2 table: the entry with the smallest
   hit counter (first such slot on ties) is evicted. */
static inline int l2_cache_new_entry(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int victim = 0;
    int i;

    for (i = 1; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < s->l2_cache_counts[victim])
            victim = i;
    }
    return victim;
}
420

    
421
/* Round 'offset' up to the next multiple of 'n'.  'n' must be a
   power of two; the mask is widened to 64 bits so offsets above
   4 GB round correctly. */
static int64_t align_offset(int64_t offset, int n)
{
    int64_t mask = n - 1;

    return (offset + mask) & ~mask;
}
426

    
427
/* Grow the L1 table so it has at least 'min_size' entries: allocate a
   larger copy, write it out big-endian, point the image header at it,
   then free the old table's clusters.  Returns 0 on success,
   -ENOMEM/-EIO on error; on error the previous table stays in use. */
static int grow_l1_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size, new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    uint64_t new_l1_table_offset;
    uint64_t data64;
    uint32_t data32;

    new_l1_size = s->l1_size;
    if (min_size <= new_l1_size)
        return 0;
    /* grow geometrically (factor 1.5) to amortize repeated growth */
    while (min_size > new_l1_size) {
        new_l1_size = (new_l1_size * 3 + 1) / 2;
    }
#ifdef DEBUG_ALLOC2
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_mallocz(new_l1_size2);
    if (!new_l1_table)
        return -ENOMEM;
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);

    /* convert to big-endian for the write, then back for in-memory use */
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
    if (ret != new_l1_size2)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set the new table offset and size in the image header */
    data64 = cpu_to_be64(new_l1_table_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_table_offset),
                    &data64, sizeof(data64)) != sizeof(data64))
        goto fail;
    data32 = cpu_to_be32(new_l1_size);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size),
                    &data32, sizeof(data32)) != sizeof(data32))
        goto fail;
    qemu_free(s->l1_table);
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
 fail:
    /* Bug fix: release the new, not-yet-installed table.  The old
       code freed s->l1_table here, leaving the still-referenced table
       dangling and leaking new_l1_table. */
    qemu_free(new_l1_table);
    return -EIO;
}
482

    
483
/* Map a guest byte offset to a host cluster offset, optionally
   allocating.  'allocate' is:
 *
 * 0 not to allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated (also used as the error return — callers
 * cannot distinguish "hole" from "I/O error").
 */
static uint64_t get_cluster_offset(BlockDriverState *bs,
                                   uint64_t offset, int allocate,
                                   int compressed_size,
                                   int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int min_index, i, j, l1_index, l2_index, ret;
    uint64_t l2_offset, *l2_table, cluster_offset, tmp, old_l2_offset;

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        /* outside l1 table is allowed: we grow the table if needed */
        if (!allocate)
            return 0;
        if (grow_l1_table(bs, l1_index + 1) < 0)
            return 0;
    }
    l2_offset = s->l1_table[l1_index];
    if (!l2_offset) {
        if (!allocate)
            return 0;
    l2_allocate:
        /* COW of the L2 table: allocate a fresh one, seed it from the
           old table (if any), and mark the L1 entry as COPIED */
        old_l2_offset = l2_offset;
        /* allocate a new l2 entry */
        l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
        /* update the L1 entry */
        s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
        tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
        if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
                        &tmp, sizeof(tmp)) != sizeof(tmp))
            return 0;
        min_index = l2_cache_new_entry(bs);
        l2_table = s->l2_cache + (min_index << s->l2_bits);

        if (old_l2_offset == 0) {
            /* brand new L2 table: all entries unallocated */
            memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
        } else {
            if (bdrv_pread(s->hd, old_l2_offset,
                           l2_table, s->l2_size * sizeof(uint64_t)) !=
                s->l2_size * sizeof(uint64_t))
                return 0;
        }
        if (bdrv_pwrite(s->hd, l2_offset,
                        l2_table, s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return 0;
    } else {
        if (!(l2_offset & QCOW_OFLAG_COPIED)) {
            /* L2 table shared with a snapshot: COW it before writing */
            if (allocate) {
                free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
                goto l2_allocate;
            }
        } else {
            l2_offset &= ~QCOW_OFLAG_COPIED;
        }
        /* look the L2 table up in the cache */
        for(i = 0; i < L2_CACHE_SIZE; i++) {
            if (l2_offset == s->l2_cache_offsets[i]) {
                /* increment the hit count */
                if (++s->l2_cache_counts[i] == 0xffffffff) {
                    /* counter about to saturate: age all entries */
                    for(j = 0; j < L2_CACHE_SIZE; j++) {
                        s->l2_cache_counts[j] >>= 1;
                    }
                }
                l2_table = s->l2_cache + (i << s->l2_bits);
                goto found;
            }
        }
        /* not found: load a new entry in the least used one */
        min_index = l2_cache_new_entry(bs);
        l2_table = s->l2_cache + (min_index << s->l2_bits);
        if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return 0;
    }
    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;
 found:
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (!cluster_offset) {
        if (!allocate)
            return cluster_offset;
    } else if (!(cluster_offset & QCOW_OFLAG_COPIED)) {
        if (!allocate)
            return cluster_offset;
        /* cluster shared (snapshot) or compressed: release it and
           allocate a fresh writable one below */
        /* free the cluster */
        if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            int nb_csectors;
            nb_csectors = ((cluster_offset >> s->csize_shift) &
                           s->csize_mask) + 1;
            free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
                          nb_csectors * 512);
        } else {
            free_clusters(bs, cluster_offset, s->cluster_size);
        }
    } else {
        /* already allocated and exclusively owned: use it directly */
        cluster_offset &= ~QCOW_OFLAG_COPIED;
        return cluster_offset;
    }
    if (allocate == 1) {
        /* allocate a new cluster */
        cluster_offset = alloc_clusters(bs, s->cluster_size);

        /* we must initialize the cluster content which won't be
           written */
        if ((n_end - n_start) < s->cluster_sectors) {
            uint64_t start_sect;

            /* copy-on-write the head and tail around [n_start, n_end) */
            start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
            ret = copy_sectors(bs, start_sect,
                               cluster_offset, 0, n_start);
            if (ret < 0)
                return 0;
            ret = copy_sectors(bs, start_sect,
                               cluster_offset, n_end, s->cluster_sectors);
            if (ret < 0)
                return 0;
        }
        tmp = cpu_to_be64(cluster_offset | QCOW_OFLAG_COPIED);
    } else {
        int nb_csectors;
        cluster_offset = alloc_bytes(bs, compressed_size);
        nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
            (cluster_offset >> 9);
        /* encode offset + sector count in one L2 entry */
        cluster_offset |= QCOW_OFLAG_COMPRESSED |
            ((uint64_t)nb_csectors << s->csize_shift);
        /* compressed clusters never have the copied flag */
        tmp = cpu_to_be64(cluster_offset);
    }
    /* update L2 table */
    l2_table[l2_index] = tmp;
    if (bdrv_pwrite(s->hd,
                    l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
        return 0;
    return cluster_offset;
}
632

    
633
/* Report whether 'sector_num' is allocated in this image and how many
   contiguous sectors (up to 'nb_sectors', never crossing a cluster
   boundary) share that state; the count goes to *pnum. */
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t offset;
    int remaining;

    /* probe the mapping without allocating anything */
    offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
    /* sectors left in this cluster, clipped to the request size */
    remaining = s->cluster_sectors - (int)(sector_num & (s->cluster_sectors - 1));
    if (remaining > nb_sectors)
        remaining = nb_sectors;
    *pnum = remaining;
    return offset != 0;
}
648

    
649
/* Inflate a raw-deflate buffer into out_buf.  Succeeds only when the
   output is exactly 'out_buf_size' bytes.  Returns 0 on success,
   -1 on any zlib error or size mismatch. */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm;
    int ret, out_len;

    memset(&strm, 0, sizeof(strm));
    strm.next_in = (uint8_t *)buf;
    strm.avail_in = buf_size;
    strm.next_out = out_buf;
    strm.avail_out = out_buf_size;

    /* negative windowBits (-12) selects a raw deflate stream */
    if (inflateInit2(&strm, -12) != Z_OK)
        return -1;
    ret = inflate(&strm, Z_FINISH);
    out_len = strm.next_out - out_buf;
    inflateEnd(&strm);
    /* Z_BUF_ERROR is tolerated: compressed clusters are stored padded
       to the sector size, so input may remain once out_buf is full */
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size)
        return -1;
    return 0;
}
675
                             
676
/* Ensure s->cluster_cache holds the decompressed contents of the
   compressed cluster described by L2 entry 'cluster_offset'.
   s->cluster_cache_offset is the cache tag, so repeated reads of the
   same cluster decompress only once.  Returns 0 on success, -1 on
   read or decompression error. */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
{
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    /* byte offset of the compressed data inside the image file */
    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        /* sector count is stored in the high bits of the L2 entry */
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        /* read whole sectors into the bounce buffer, then skip the
           leading bytes that belong to the previous cluster */
        ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
        if (ret < 0) {
            return -1;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -1;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}
698

    
699
/* handle reading after the end of the backing file */
700
static int backing_read1(BlockDriverState *bs,
701
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
702
{
703
    int n1;
704
    if ((sector_num + nb_sectors) <= bs->total_sectors)
705
        return nb_sectors;
706
    if (sector_num >= bs->total_sectors)
707
        n1 = 0;
708
    else
709
        n1 = bs->total_sectors - sector_num;
710
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
711
    return n1;
712
}
713

    
714
/* Synchronous read of nb_sectors 512-byte sectors starting at
   sector_num, handling unallocated clusters (backing file or zero
   fill), compressed clusters and encryption.
   Returns 0 on success, -1 on error. */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n, n1;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        /* look up the mapping without allocating */
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        /* process at most up to the next cluster boundary */
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors)
            n = nb_sectors;
        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
                if (n1 > 0) {
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
                    if (ret < 0)
                        return -1;
                }
            } else {
                /* unallocated and no backing file: reads as zeroes */
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            /* serve from the decompressed-cluster cache */
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                /* decrypt in place (enc == 0) */
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
758

    
759
/* Synchronous write of nb_sectors 512-byte sectors starting at
   sector_num, allocating (and copy-on-writing) clusters as needed and
   encrypting if a key is set.  Returns 0 on success, -1 on error. */
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
                     const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        /* process at most up to the next cluster boundary */
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors)
            n = nb_sectors;
        /* allocate == 1: get a writable, uncompressed cluster; the
           sectors outside [index_in_cluster, index_in_cluster + n)
           are copy-on-written from the old contents */
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
                                            index_in_cluster,
                                            index_in_cluster + n);
        if (!cluster_offset)
            return -1;
        if (s->crypt_method) {
            /* encrypt into the bounce buffer so 'buf' stays untouched */
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
                            &s->aes_encrypt_key);
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
                              s->cluster_data, n * 512);
        } else {
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
        }
        if (ret != n * 512)
            return -1;
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    s->cluster_cache_offset = -1; /* disable compressed cache */
    return 0;
}
793

    
794
/* Per-request state for asynchronous I/O; the request is advanced
   cluster by cluster from the completion callbacks. */
typedef struct QCowAIOCB {
    BlockDriverAIOCB common;    /* embedded generic part, returned to callers */
    int64_t sector_num;         /* next sector to process */
    uint8_t *buf;               /* current position in the caller's buffer */
    int nb_sectors;             /* sectors still to process */
    int n;                      /* sectors covered by the pending step */
    uint64_t cluster_offset;    /* mapping used by the pending step */
    uint8_t *cluster_data;
    BlockDriverAIOCB *hd_aiocb; /* in-flight lower-level request, if any */
} QCowAIOCB;
804

    
805
/* Completion callback driving an async read, cluster by cluster.
   Called once with ret == 0 to start the request (from
   qcow_aio_read), then re-invoked on each lower-level AIO completion.
   First post-processes the data of the previous step (decryption),
   then issues the next step or completes the whole request. */
static void qcow_aio_read_cb(void *opaque, int ret)
{
    QCowAIOCB *acb = opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n1;

    acb->hd_aiocb = NULL;
    if (ret < 0) {
    fail:
        /* report the error and release the request */
        acb->common.cb(acb->common.opaque, ret);
        qemu_aio_release(acb);
        return;
    }

 redo:
    /* post process the read buffer */
    if (!acb->cluster_offset) {
        /* nothing to do */
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* nothing to do */
    } else {
        if (s->crypt_method) {
            /* decrypt the sectors delivered by the previous step */
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
                            acb->n, 0,
                            &s->aes_decrypt_key);
        }
    }

    /* advance past the acb->n sectors just handled */
    acb->nb_sectors -= acb->n;
    acb->sector_num += acb->n;
    acb->buf += acb->n * 512;

    if (acb->nb_sectors == 0) {
        /* request completed */
        acb->common.cb(acb->common.opaque, 0);
        qemu_aio_release(acb);
        return;
    }

    /* prepare next AIO request */
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
                                             0, 0, 0, 0);
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
    acb->n = s->cluster_sectors - index_in_cluster;
    if (acb->n > acb->nb_sectors)
        acb->n = acb->nb_sectors;

    if (!acb->cluster_offset) {
        if (bs->backing_hd) {
            /* read from the base image */
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
                               acb->buf, acb->n);
            if (n1 > 0) {
                acb->hd_aiocb = bdrv_aio_read(bs->backing_hd, acb->sector_num,
                                    acb->buf, acb->n, qcow_aio_read_cb, acb);
                if (acb->hd_aiocb == NULL)
                    goto fail;
            } else {
                /* entirely past backing-file EOF: already zero-filled */
                goto redo;
            }
        } else {
            /* Note: in this case, no need to wait */
            memset(acb->buf, 0, 512 * acb->n);
            goto redo;
        }
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* add AIO support for compressed blocks ? */
        if (decompress_cluster(s, acb->cluster_offset) < 0)
            goto fail;
        memcpy(acb->buf,
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
        goto redo;
    } else {
        if ((acb->cluster_offset & 511) != 0) {
            /* corrupt mapping: data clusters must be sector-aligned */
            ret = -EIO;
            goto fail;
        }
        acb->hd_aiocb = bdrv_aio_read(s->hd,
                            (acb->cluster_offset >> 9) + index_in_cluster,
                            acb->buf, acb->n, qcow_aio_read_cb, acb);
        if (acb->hd_aiocb == NULL)
            goto fail;
    }
}
890

    
891
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
892
        int64_t sector_num, uint8_t *buf, int nb_sectors,
893
        BlockDriverCompletionFunc *cb, void *opaque)
894
{
895
    QCowAIOCB *acb;
896

    
897
    acb = qemu_aio_get(bs, cb, opaque);
898
    if (!acb)
899
        return NULL;
900
    acb->hd_aiocb = NULL;
901
    acb->sector_num = sector_num;
902
    acb->buf = buf;
903
    acb->nb_sectors = nb_sectors;
904
    acb->n = 0;
905
    acb->cluster_offset = 0;
906
    return acb;
907
}
908

    
909
static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
910
        int64_t sector_num, uint8_t *buf, int nb_sectors,
911
        BlockDriverCompletionFunc *cb, void *opaque)
912
{
913
    QCowAIOCB *acb;
914

    
915
    acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
916
    if (!acb)
917
        return NULL;
918

    
919
    qcow_aio_read_cb(acb, 0);
920
    return &acb->common;
921
}
922

    
923
static void qcow_aio_write_cb(void *opaque, int ret)
924
{
925
    QCowAIOCB *acb = opaque;
926
    BlockDriverState *bs = acb->common.bs;
927
    BDRVQcowState *s = bs->opaque;
928
    int index_in_cluster;
929
    uint64_t cluster_offset;
930
    const uint8_t *src_buf;
931

    
932
    acb->hd_aiocb = NULL;
933

    
934
    if (ret < 0) {
935
    fail:
936
        acb->common.cb(acb->common.opaque, ret);
937
        qemu_aio_release(acb);
938
        return;
939
    }
940

    
941
    acb->nb_sectors -= acb->n;
942
    acb->sector_num += acb->n;
943
    acb->buf += acb->n * 512;
944

    
945
    if (acb->nb_sectors == 0) {
946
        /* request completed */
947
        acb->common.cb(acb->common.opaque, 0);
948
        qemu_aio_release(acb);
949
        return;
950
    }
951
   
952
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
953
    acb->n = s->cluster_sectors - index_in_cluster;
954
    if (acb->n > acb->nb_sectors)
955
        acb->n = acb->nb_sectors;
956
    cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
957
                                        index_in_cluster,
958
                                        index_in_cluster + acb->n);
959
    if (!cluster_offset || (cluster_offset & 511) != 0) {
960
        ret = -EIO;
961
        goto fail;
962
    }
963
    if (s->crypt_method) {
964
        if (!acb->cluster_data) {
965
            acb->cluster_data = qemu_mallocz(s->cluster_size);
966
            if (!acb->cluster_data) {
967
                ret = -ENOMEM;
968
                goto fail;
969
            }
970
        }
971
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
972
                        acb->n, 1, &s->aes_encrypt_key);
973
        src_buf = acb->cluster_data;
974
    } else {
975
        src_buf = acb->buf;
976
    }
977
    acb->hd_aiocb = bdrv_aio_write(s->hd,
978
                                   (cluster_offset >> 9) + index_in_cluster,
979
                                   src_buf, acb->n,
980
                                   qcow_aio_write_cb, acb);
981
    if (acb->hd_aiocb == NULL)
982
        goto fail;
983
}
984

    
985
static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
986
        int64_t sector_num, const uint8_t *buf, int nb_sectors,
987
        BlockDriverCompletionFunc *cb, void *opaque)
988
{
989
    BDRVQcowState *s = bs->opaque;
990
    QCowAIOCB *acb;
991
   
992
    s->cluster_cache_offset = -1; /* disable compressed cache */
993

    
994
    acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
995
    if (!acb)
996
        return NULL;
997
   
998
    qcow_aio_write_cb(acb, 0);
999
    return &acb->common;
1000
}
1001

    
1002
/* Cancel an in-flight request: propagate the cancel to the lower
   level request, if any, and release the control block. */
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QCowAIOCB *req = (QCowAIOCB *)blockacb;

    if (req->hd_aiocb != NULL)
        bdrv_aio_cancel(req->hd_aiocb);
    qemu_aio_release(req);
}
1009

    
1010
/* Release all per-image state (tables, caches, refcount machinery)
   and delete the underlying file handle. */
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    refcount_close(bs);
    bdrv_delete(s->hd);
}
1020

    
1021
/* XXX: use std qcow open function ? */
1022
/* Transient state used only while creating a new image
   (see qcow_create() and create_refcount_update()). */
typedef struct QCowCreateState {
    int cluster_size;              /* bytes per cluster (1 << cluster_bits) */
    int cluster_bits;
    uint16_t *refcount_block;      /* first refcount block, big-endian entries */
    uint64_t *refcount_table;      /* refcount table, big-endian entries */
    int64_t l1_table_offset;       /* file offsets of the metadata areas */
    int64_t refcount_table_offset;
    int64_t refcount_block_offset;
} QCowCreateState;
1031

    
1032
/* Bump the in-memory refcount block entry (big-endian) for every
   cluster overlapped by the byte range [offset, offset + size). */
static void create_refcount_update(QCowCreateState *s,
                                   int64_t offset, int64_t size)
{
    int64_t first_cluster, last_cluster, c;
    uint16_t *entry;

    first_cluster = offset & ~(s->cluster_size - 1);
    last_cluster = (offset + size - 1) & ~(s->cluster_size - 1);
    for (c = first_cluster; c <= last_cluster; c += s->cluster_size) {
        entry = &s->refcount_block[c >> s->cluster_bits];
        *entry = cpu_to_be16(be16_to_cpu(*entry) + 1);
    }
}
1049

    
1050
/* Create a new qcow2 image file.
 *
 * filename:     path of the image (created or truncated)
 * total_size:   guest disk size in 512-byte sectors
 * backing_file: optional backing file name stored after the header
 * flags:        non-zero selects AES encryption of guest data
 *
 * Layout written: header (+ backing file name), empty L1 table, one
 * refcount table cluster, one refcount block cluster.
 * Returns 0 on success, -1 on open failure, -ENOMEM on allocation
 * failure.
 * NOTE(review): write()/lseek() results are not checked, so a short
 * write silently leaves a truncated image behind. */
static int qcow_create(const char *filename, int64_t total_size,
                      const char *backing_file, int flags)
{
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
    QCowHeader header;
    uint64_t tmp, offset;
    QCowCreateState s1, *s = &s1;

    memset(s, 0, sizeof(*s));

    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
    if (fd < 0)
        return -1;
    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
    header.size = cpu_to_be64(total_size * 512);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
        /* the backing file name is stored right after the fixed header */
        header.backing_file_offset = cpu_to_be64(header_size);
        backing_filename_len = strlen(backing_file);
        header.backing_file_size = cpu_to_be32(backing_filename_len);
        header_size += backing_filename_len;
    }
    s->cluster_bits = 12;  /* 4 KB clusters */
    s->cluster_size = 1 << s->cluster_bits;
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
    header_size = (header_size + 7) & ~7;   /* round up to 8-byte multiple */
    if (flags) {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    } else {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    }
    /* one L2 table is a cluster of 8-byte entries */
    l2_bits = s->cluster_bits - 3;
    shift = s->cluster_bits + l2_bits;
    /* number of L1 entries needed to cover the whole disk */
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
    offset = align_offset(header_size, s->cluster_size);
    s->l1_table_offset = offset;
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
    header.l1_size = cpu_to_be32(l1_size);
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);

    s->refcount_table = qemu_mallocz(s->cluster_size);
    if (!s->refcount_table)
        goto fail;
    s->refcount_block = qemu_mallocz(s->cluster_size);
    if (!s->refcount_block)
        goto fail;

    /* one cluster of refcount table ... */
    s->refcount_table_offset = offset;
    header.refcount_table_offset = cpu_to_be64(offset);
    header.refcount_table_clusters = cpu_to_be32(1);
    offset += s->cluster_size;

    /* ... whose first entry points at one refcount block cluster */
    s->refcount_table[0] = cpu_to_be64(offset);
    s->refcount_block_offset = offset;
    offset += s->cluster_size;

    /* update refcounts */
    create_refcount_update(s, 0, header_size);
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
    create_refcount_update(s, s->refcount_block_offset, s->cluster_size);

    /* write all the data */
    write(fd, &header, sizeof(header));
    if (backing_file) {
        write(fd, backing_file, backing_filename_len);
    }
    lseek(fd, s->l1_table_offset, SEEK_SET);
    tmp = 0;
    for(i = 0;i < l1_size; i++) {
        write(fd, &tmp, sizeof(tmp));
    }
    lseek(fd, s->refcount_table_offset, SEEK_SET);
    write(fd, s->refcount_table, s->cluster_size);

    lseek(fd, s->refcount_block_offset, SEEK_SET);
    write(fd, s->refcount_block, s->cluster_size);

    qemu_free(s->refcount_table);
    qemu_free(s->refcount_block);
    close(fd);
    return 0;
 fail:
    qemu_free(s->refcount_table);
    qemu_free(s->refcount_block);
    close(fd);
    return -ENOMEM;
}
1141

    
1142
/* Reset the image to an all-unallocated state.
 * The real implementation is disabled (see the XXX) because it does
 * not maintain refcounts correctly; currently this is a no-op that
 * reports success. */
static int qcow_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1161

    
1162
/* XXX: put compressed sectors first, then all the cluster aligned
1163
   tables to avoid losing bytes in alignment */
1164
/* Write one cluster of guest data in compressed form.
 *
 * nb_sectors == 0 is a special "finalize" call: the file end is padded
 * to a 512-byte boundary so it can be read with sector based I/O.
 * Otherwise nb_sectors must be exactly one cluster.  If deflate cannot
 * shrink the data, the cluster is written uncompressed via qcow_write().
 * Returns 0 on success, -EINVAL/-ENOMEM/-1 on error. */
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(s->hd);
        cluster_offset = (cluster_offset + 511) & ~511;
        bdrv_truncate(s->hd, cluster_offset);
        return 0;
    }

    if (nb_sectors != s->cluster_sectors)
        return -EINVAL;

    /* worst-case deflate expansion: input + 0.1% + 128 bytes */
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
    if (!out_buf)
        return -ENOMEM;

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
        qemu_free(out_buf);
        return -1;
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    /* only allow cluster_size output: anything bigger is not worth
       storing compressed anyway */
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        qemu_free(out_buf);
        deflateEnd(&strm);
        return -1;
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
    } else {
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
                                            out_len, 0, 0);
        /* bug fix: a zero return means the cluster could not be
           allocated; previously the compressed data would have been
           written at the masked (bogus) offset 0 */
        if (!cluster_offset) {
            qemu_free(out_buf);
            return -1;
        }
        cluster_offset &= s->cluster_offset_mask;
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
            qemu_free(out_buf);
            return -1;
        }
    }

    qemu_free(out_buf);
    return 0;
}
1230

    
1231
/* Flush all pending writes of the underlying image file. */
static void qcow_flush(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;

    bdrv_flush(state->hd);
}
1236

    
1237
/* Fill in driver information: cluster size and the byte offset at
   which VM state is stored (derived from l1_vm_state_index). */
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;

    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset =
        (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
    return 0;
}
1245

    
1246
/*********************************************************/
1247
/* snapshot support */
1248

    
1249
/* update the refcounts of snapshots and the copied flag */
1250
/* Walk one L1 table (the active one or a snapshot's) and apply
 * 'addend' (+1, -1 or 0) to the refcount of every cluster it
 * references, including the L2 table clusters themselves.  At the
 * same time, recompute the QCOW_OFLAG_COPIED flag, which must be set
 * exactly when a cluster's refcount is 1.
 *
 * l1_table_offset/l1_size identify the table; when they match the
 * active table, s->l1_table is used directly instead of re-reading
 * from disk.  Returns 0 on success, -EIO on any I/O or allocation
 * failure. */
static int update_snapshot_refcount(BlockDriverState *bs,
                                    int64_t l1_table_offset,
                                    int l1_size,
                                    int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
    int64_t old_offset, old_l2_offset;
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;

    /* the L2 cache may describe clusters whose flags change below */
    l2_cache_reset(bs);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    l1_allocated = 0;
    if (l1_table_offset != s->l1_table_offset) {
        /* snapshot L1 table: load it from disk (stored big-endian) */
        l1_table = qemu_malloc(l1_size2);
        if (!l1_table)
            goto fail;
        l1_allocated = 1;
        if (bdrv_pread(s->hd, l1_table_offset,
                       l1_table, l1_size2) != l1_size2)
            goto fail;
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = 0;
    }

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    if (!l2_table)
        goto fail;
    l1_modified = 0;
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= ~QCOW_OFLAG_COPIED;
            l2_modified = 0;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    old_offset = offset;
                    offset &= ~QCOW_OFLAG_COPIED;
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* the size of a compressed cluster (in 512-byte
                           sectors) is encoded in the entry itself */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0)
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
                                            nb_csectors * 512, addend);
                        /* compressed clusters are never modified */
                        refcount = 2;
                    } else {
                        if (addend != 0) {
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
                        } else {
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
                        }
                    }

                    /* COPIED means "single user, safe to write in place" */
                    if (refcount == 1) {
                        offset |= QCOW_OFLAG_COPIED;
                    }
                    if (offset != old_offset) {
                        l2_table[j] = cpu_to_be64(offset);
                        l2_modified = 1;
                    }
                }
            }
            if (l2_modified) {
                if (bdrv_pwrite(s->hd,
                                l2_offset, l2_table, l2_size) != l2_size)
                    goto fail;
            }

            /* also adjust the refcount of the L2 table cluster itself */
            if (addend != 0) {
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }
    if (l1_modified) {
        /* write the L1 table back big-endian, then restore CPU order */
        for(i = 0; i < l1_size; i++)
            cpu_to_be64s(&l1_table[i]);
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
                        l1_size2) != l1_size2)
            goto fail;
        for(i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
1364

    
1365
/* Free the per-snapshot strings and the snapshot table itself,
   leaving the state as "no snapshots". */
static void qcow_free_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *snap;
    int idx;

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        snap = &s->snapshots[idx];
        qemu_free(snap->name);
        qemu_free(snap->id_str);
    }
    qemu_free(s->snapshots);
    s->snapshots = NULL;
    s->nb_snapshots = 0;
}
1378

    
1379
/* Load the snapshot table from disk into s->snapshots.
 * s->snapshots_offset and s->nb_snapshots must already be set from
 * the image header.  On success, s->snapshots_size is set to the
 * on-disk byte size of the table.  Returns 0 on success, -1 on error
 * (any partially built state is freed).
 * NOTE(review): id_str_size/name_size/extra_data_size come from the
 * image file and are used without validation — a corrupted image
 * could make them arbitrarily large.  Confirm callers can tolerate
 * that. */
static int qcow_read_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshotHeader h;
    QCowSnapshot *sn;
    int i, id_str_size, name_size;
    int64_t offset;
    uint32_t extra_data_size;

    offset = s->snapshots_offset;
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
    if (!s->snapshots)
        goto fail;
    for(i = 0; i < s->nb_snapshots; i++) {
        /* each snapshot record starts on an 8-byte boundary */
        offset = align_offset(offset, 8);
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        sn = s->snapshots + i;
        /* all on-disk fields are big-endian */
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
        sn->l1_size = be32_to_cpu(h.l1_size);
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
        sn->date_sec = be32_to_cpu(h.date_sec);
        sn->date_nsec = be32_to_cpu(h.date_nsec);
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
        extra_data_size = be32_to_cpu(h.extra_data_size);

        id_str_size = be16_to_cpu(h.id_str_size);
        name_size = be16_to_cpu(h.name_size);

        /* skip extra data this version does not understand */
        offset += extra_data_size;

        sn->id_str = qemu_malloc(id_str_size + 1);
        if (!sn->id_str)
            goto fail;
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        sn->id_str[id_str_size] = '\0';

        sn->name = qemu_malloc(name_size + 1);
        if (!sn->name)
            goto fail;
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
        sn->name[name_size] = '\0';
    }
    s->snapshots_size = offset - s->snapshots_offset;
    return 0;
 fail:
    qcow_free_snapshots(bs);
    return -1;
}
1433

    
1434
/* add at the end of the file a new list of snapshots */
1435
/* Serialize the in-memory snapshot table into freshly allocated
 * clusters at the end of the image, update the header fields that
 * point to it, then release the old table's clusters.
 * Returns 0 on success, -1 on error.
 * NOTE(review): the alloc_clusters() result is used unchecked, and
 * the newly allocated clusters are not released on the failure path. */
static int qcow_write_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    QCowSnapshotHeader h;
    int i, name_size, id_str_size, snapshots_size;
    uint64_t data64;
    uint32_t data32;
    int64_t offset, snapshots_offset;

    /* compute the size of the snapshots */
    offset = 0;
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        offset = align_offset(offset, 8);
        offset += sizeof(h);
        offset += strlen(sn->id_str);
        offset += strlen(sn->name);
    }
    snapshots_size = offset;

    snapshots_offset = alloc_clusters(bs, snapshots_size);
    offset = snapshots_offset;

    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        /* build the on-disk (big-endian) record for this snapshot */
        memset(&h, 0, sizeof(h));
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
        h.l1_size = cpu_to_be32(sn->l1_size);
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
        h.date_sec = cpu_to_be32(sn->date_sec);
        h.date_nsec = cpu_to_be32(sn->date_nsec);
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);

        id_str_size = strlen(sn->id_str);
        name_size = strlen(sn->name);
        h.id_str_size = cpu_to_be16(id_str_size);
        h.name_size = cpu_to_be16(name_size);
        /* each record starts on an 8-byte boundary */
        offset = align_offset(offset, 8);
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
    }

    /* update the various header fields */
    data64 = cpu_to_be64(snapshots_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
                    &data64, sizeof(data64)) != sizeof(data64))
        goto fail;
    data32 = cpu_to_be32(s->nb_snapshots);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
                    &data32, sizeof(data32)) != sizeof(data32))
        goto fail;

    /* free the old snapshot table */
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
    s->snapshots_offset = snapshots_offset;
    s->snapshots_size = snapshots_size;
    return 0;
 fail:
    return -1;
}
1503

    
1504
/* Generate a fresh decimal snapshot id: one larger than the largest
   numeric id currently in use. */
static void find_new_snapshot_id(BlockDriverState *bs,
                                 char *id_str, int id_str_size)
{
    BDRVQcowState *s = bs->opaque;
    int idx, cur, id_max = 0;

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        cur = strtoul(s->snapshots[idx].id_str, NULL, 10);
        if (cur > id_max)
            id_max = cur;
    }
    snprintf(id_str, id_str_size, "%d", id_max + 1);
}
1519

    
1520
/* Return the index of the snapshot whose id exactly matches id_str,
   or -1 if there is none. */
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
{
    BDRVQcowState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < s->nb_snapshots; idx++) {
        if (strcmp(s->snapshots[idx].id_str, id_str) == 0)
            return idx;
    }
    return -1;
}
1531

    
1532
/* Look a snapshot up first by id, then by name.
   Returns the snapshot index, or -1 if neither matches. */
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
{
    BDRVQcowState *s = bs->opaque;
    int i, ret;

    ret = find_snapshot_by_id(bs, name);
    if (ret < 0) {
        /* not a known id: fall back to a name search */
        for (i = 0; i < s->nb_snapshots; i++) {
            if (strcmp(s->snapshots[i].name, name) == 0)
                return i;
        }
        return -1;
    }
    return ret;
}
1546

    
1547
/* if no id is provided, a new one is constructed */
1548
static int qcow_snapshot_create(BlockDriverState *bs,
1549
                                QEMUSnapshotInfo *sn_info)
1550
{
1551
    BDRVQcowState *s = bs->opaque;
1552
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
1553
    int i, ret;
1554
    uint64_t *l1_table = NULL;
1555
   
1556
    memset(sn, 0, sizeof(*sn));
1557

    
1558
    if (sn_info->id_str[0] == '\0') {
1559
        /* compute a new id */
1560
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
1561
    }
1562

    
1563
    /* check that the ID is unique */
1564
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
1565
        return -ENOENT;
1566

    
1567
    sn->id_str = qemu_strdup(sn_info->id_str);
1568
    if (!sn->id_str)
1569
        goto fail;
1570
    sn->name = qemu_strdup(sn_info->name);
1571
    if (!sn->name)
1572
        goto fail;
1573
    sn->vm_state_size = sn_info->vm_state_size;
1574
    sn->date_sec = sn_info->date_sec;
1575
    sn->date_nsec = sn_info->date_nsec;
1576
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
1577

    
1578
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
1579
    if (ret < 0)
1580
        goto fail;
1581

    
1582
    /* create the L1 table of the snapshot */
1583
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
1584
    sn->l1_size = s->l1_size;
1585

    
1586
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
1587
    if (!l1_table)
1588
        goto fail;
1589
    for(i = 0; i < s->l1_size; i++) {
1590
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
1591
    }
1592
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
1593
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
1594
        (s->l1_size * sizeof(uint64_t)))
1595
        goto fail;
1596
    qemu_free(l1_table);
1597
    l1_table = NULL;
1598

    
1599
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
1600
    if (!snapshots1)
1601
        goto fail;
1602
    memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
1603
    s->snapshots = snapshots1;
1604
    s->snapshots[s->nb_snapshots++] = *sn;
1605

    
1606
    if (qcow_write_snapshots(bs) < 0)
1607
        goto fail;
1608
#ifdef DEBUG_ALLOC
1609
    check_refcounts(bs);
1610
#endif
1611
    return 0;
1612
 fail:
1613
    qemu_free(sn->name);
1614
    qemu_free(l1_table);
1615
    return -1;
1616
}
1617

    
1618
/* copy the snapshot 'snapshot_name' into the current disk image */
1619
/* Revert the active image state to the snapshot given by id or name.
 * The refcounts held by the current L1 table are dropped, the
 * snapshot's L1 table is copied over the active one (on disk and in
 * memory), and refcounts are re-added through the restored table.
 * Returns 0 on success, -ENOENT if the snapshot does not exist, and
 * -EIO on failure (NOTE(review): metadata may be inconsistent then —
 * there is no rollback). */
static int qcow_snapshot_goto(BlockDriverState *bs,
                              const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int i, snapshot_index, l1_size2;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    /* release the clusters referenced by the current L1 table */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
        goto fail;

    /* make sure the active L1 table can hold the snapshot's table */
    if (grow_l1_table(bs, sn->l1_size) < 0)
        goto fail;

    s->l1_size = sn->l1_size;
    l1_size2 = s->l1_size * sizeof(uint64_t);
    /* copy the snapshot l1 table to the current l1 table */
    if (bdrv_pread(s->hd, sn->l1_table_offset,
                   s->l1_table, l1_size2) != l1_size2)
        goto fail;
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
                    s->l1_table, l1_size2) != l1_size2)
        goto fail;
    /* the table was read in big-endian form; convert to CPU order */
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }

    /* take references for everything the restored table points at */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
 fail:
    return -EIO;
}
1660

    
1661
/* Delete the snapshot given by id or name: drop the refcounts held
 * through its L1 table, free that table's clusters, remove the entry
 * from the in-memory array and rewrite the on-disk snapshot table.
 * Returns 0 on success, -ENOENT if not found, or a negative error. */
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int snapshot_index, ret;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
    if (ret < 0)
        return ret;
    /* must update the copied flag on the current cluster offsets */
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
    if (ret < 0)
        return ret;
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));

    qemu_free(sn->id_str);
    qemu_free(sn->name);
    /* close the gap in the array; the trailing slot becomes unused */
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
    s->nb_snapshots--;
    ret = qcow_write_snapshots(bs);
    if (ret < 0) {
        /* XXX: restore snapshot if error ? */
        return ret;
    }
#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
}
1695

    
1696
/* Build a freshly allocated array of QEMUSnapshotInfo describing all
 * snapshots; ownership of the array passes to the caller.  Returns
 * the number of snapshots, or -ENOMEM on allocation failure (with
 * *psn_tab set to NULL).
 * NOTE(review): with nb_snapshots == 0 this performs a zero-byte
 * allocation; whether qemu_mallocz(0) returns non-NULL is not visible
 * here — confirm the failure path is not taken spuriously. */
static int qcow_snapshot_list(BlockDriverState *bs,
                              QEMUSnapshotInfo **psn_tab)
{
    BDRVQcowState *s = bs->opaque;
    QEMUSnapshotInfo *sn_tab, *sn_info;
    QCowSnapshot *sn;
    int i;

    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
    if (!sn_tab)
        goto fail;
    for(i = 0; i < s->nb_snapshots; i++) {
        sn_info = sn_tab + i;
        sn = s->snapshots + i;
        /* copy (truncating if necessary) into the fixed-size fields */
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
                sn->id_str);
        pstrcpy(sn_info->name, sizeof(sn_info->name),
                sn->name);
        sn_info->vm_state_size = sn->vm_state_size;
        sn_info->date_sec = sn->date_sec;
        sn_info->date_nsec = sn->date_nsec;
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
    }
    *psn_tab = sn_tab;
    return s->nb_snapshots;
 fail:
    qemu_free(sn_tab);
    *psn_tab = NULL;
    return -ENOMEM;
}
1726

    
1727
/*********************************************************/
1728
/* refcount handling */
1729

    
1730
/* Allocate the refcount block cache and load the refcount table from
 * disk.  s->refcount_table_size/offset must already be set from the
 * header.  Returns 0 on success, -ENOMEM on failure.
 * NOTE(review): a short read of the table also yields -ENOMEM (-EIO
 * would be more accurate), and the buffers allocated here are not
 * freed on the failure path — presumably the caller tears down via
 * refcount_close(); confirm. */
static int refcount_init(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int ret, refcount_table_size2, i;

    /* one cluster-sized cache for the most recently used refcount block */
    s->refcount_block_cache = qemu_malloc(s->cluster_size);
    if (!s->refcount_block_cache)
        goto fail;
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = qemu_malloc(refcount_table_size2);
    if (!s->refcount_table)
        goto fail;
    if (s->refcount_table_size > 0) {
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret != refcount_table_size2)
            goto fail;
        /* the table is stored big-endian on disk */
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
    }
    return 0;
 fail:
    return -ENOMEM;
}
1754

    
1755
/* Release the buffers allocated by refcount_init(). */
static void refcount_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    qemu_free(s->refcount_table);
    qemu_free(s->refcount_block_cache);
}
1761

    
1762

    
1763
/* Fill the refcount block cache from the given file offset and
   remember which block is cached.
   Returns 0 on success, -EIO if the read came up short. */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset)
{
    BDRVQcowState *s = bs->opaque;

    if (bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
                   s->cluster_size) != s->cluster_size)
        return -EIO;
    s->refcount_block_cache_offset = refcount_block_offset;
    return 0;
}
1775

    
1776
/* Return the refcount of @cluster_index.  Clusters beyond the refcount
 * table, or covered by a missing refcount block, report 0 (free).  If
 * the refcount block cannot be read, 1 is returned so the cluster is
 * conservatively treated as allocated. */
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *s = bs->opaque;
    int64_t block_offset;
    int table_index, index_in_block;

    table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (table_index >= s->refcount_table_size)
        return 0;
    block_offset = s->refcount_table[table_index];
    if (!block_offset)
        return 0;
    if (block_offset != s->refcount_block_cache_offset &&
        load_refcount_block(bs, block_offset) < 0) {
        /* better than nothing: return allocated if read error */
        return 1;
    }
    index_in_block = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    return be16_to_cpu(s->refcount_block_cache[index_in_block]);
}
1797

    
1798
/* return < 0 if error */
1799
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
1800
{
1801
    BDRVQcowState *s = bs->opaque;
1802
    int i, nb_clusters;
1803

    
1804
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
1805
    for(;;) {
1806
        if (get_refcount(bs, s->free_cluster_index) == 0) {
1807
            s->free_cluster_index++;
1808
            for(i = 1; i < nb_clusters; i++) {
1809
                if (get_refcount(bs, s->free_cluster_index) != 0)
1810
                    goto not_found;
1811
                s->free_cluster_index++;
1812
            }
1813
#ifdef DEBUG_ALLOC2
1814
            printf("alloc_clusters: size=%lld -> %lld\n",
1815
                   size,
1816
                   (s->free_cluster_index - nb_clusters) << s->cluster_bits);
1817
#endif
1818
            return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
1819
        } else {
1820
        not_found:
1821
            s->free_cluster_index++;
1822
        }
1823
    }
1824
}
1825

    
1826
/* Allocate @size bytes worth of clusters, set their refcounts to 1 and
 * return the byte offset of the first cluster. */
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
{
    int64_t cluster_offset = alloc_clusters_noref(bs, size);

    update_refcount(bs, cluster_offset, size, 1);
    return cluster_offset;
}
1834

    
1835
/* only used to allocate compressed sectors. We try to allocate
1836
   contiguous sectors. size must be <= cluster_size */
1837
static int64_t alloc_bytes(BlockDriverState *bs, int size)
1838
{
1839
    BDRVQcowState *s = bs->opaque;
1840
    int64_t offset, cluster_offset;
1841
    int free_in_cluster;
1842
   
1843
    assert(size > 0 && size <= s->cluster_size);
1844
    if (s->free_byte_offset == 0) {
1845
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
1846
    }
1847
 redo:
1848
    free_in_cluster = s->cluster_size -
1849
        (s->free_byte_offset & (s->cluster_size - 1));
1850
    if (size <= free_in_cluster) {
1851
        /* enough space in current cluster */
1852
        offset = s->free_byte_offset;
1853
        s->free_byte_offset += size;
1854
        free_in_cluster -= size;
1855
        if (free_in_cluster == 0)
1856
            s->free_byte_offset = 0;
1857
        if ((offset & (s->cluster_size - 1)) != 0)
1858
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
1859
    } else {
1860
        offset = alloc_clusters(bs, s->cluster_size);
1861
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
1862
        if ((cluster_offset + s->cluster_size) == offset) {
1863
            /* we are lucky: contiguous data */
1864
            offset = s->free_byte_offset;
1865
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
1866
            s->free_byte_offset += size;
1867
        } else {
1868
            s->free_byte_offset = offset;
1869
            goto redo;
1870
        }
1871
    }
1872
    return offset;
1873
}
1874

    
1875
/* Decrement the refcount of every cluster overlapping the byte range
 * [offset, offset + size); clusters whose count drops to zero become
 * available again for allocation. */
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size)
{
    update_refcount(bs, offset, size, -1);
}
1880

    
1881
/* Grow the refcount table so that it can hold at least @min_size
 * entries.  A larger table is written to newly allocated clusters, the
 * image header is pointed at it, and only then is the in-memory state
 * switched over and the old table freed.
 * Returns 0 on success (or if already big enough), -ENOMEM on
 * allocation failure, -EIO on write error. */
static int grow_refcount_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
    uint64_t *new_table;
    int64_t table_offset;
    uint64_t data64;
    uint32_t data32;
    int old_table_size;
    int64_t old_table_offset;

    if (min_size <= s->refcount_table_size)
        return 0;
    /* compute new table size */
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
    for(;;) {
        if (refcount_table_clusters == 0) {
            refcount_table_clusters = 1;
        } else {
            /* grow by roughly a factor of 1.5 per round */
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
        }
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
        if (min_size <= new_table_size)
            break;
    }
#ifdef DEBUG_ALLOC2
    printf("grow_refcount_table from %d to %d\n",
           s->refcount_table_size,
           new_table_size);
#endif
    new_table_size2 = new_table_size * sizeof(uint64_t);
    new_table = qemu_mallocz(new_table_size2);
    if (!new_table)
        return -ENOMEM;
    memcpy(new_table, s->refcount_table,
           s->refcount_table_size * sizeof(uint64_t));
    /* convert to big-endian for the on-disk copy ... */
    for(i = 0; i < s->refcount_table_size; i++)
        cpu_to_be64s(&new_table[i]);
    /* Note: we cannot update the refcount now to avoid recursion */
    table_offset = alloc_clusters_noref(bs, new_table_size2);
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
    if (ret != new_table_size2)
        goto fail;
    /* ... then back to host order for the in-memory copy */
    for(i = 0; i < s->refcount_table_size; i++)
        be64_to_cpus(&new_table[i]);

    /* point the image header at the new table */
    data64 = cpu_to_be64(table_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
                    &data64, sizeof(data64)) != sizeof(data64))
        goto fail;
    data32 = cpu_to_be32(refcount_table_clusters);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_clusters),
                    &data32, sizeof(data32)) != sizeof(data32))
        goto fail;
    qemu_free(s->refcount_table);
    old_table_offset = s->refcount_table_offset;
    old_table_size = s->refcount_table_size;
    s->refcount_table = new_table;
    s->refcount_table_size = new_table_size;
    s->refcount_table_offset = table_offset;

    /* the new table is active now: account for its own clusters and
       release the old table's clusters */
    update_refcount(bs, table_offset, new_table_size2, 1);
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
    return 0;
 fail:
    free_clusters(bs, table_offset, new_table_size2);
    qemu_free(new_table);
    return -EIO;
}
1950

    
1951
/* addend must be 1 or -1 */
/* XXX: cache several refcount block clusters ? */
/* Apply @addend to the refcount of @cluster_index, growing the
 * refcount table and creating a zeroed refcount block on demand (only
 * legal for addend > 0).  The touched refcount block becomes the cached
 * block, and free_cluster_index is moved back when a cluster becomes
 * free.  Returns the new refcount on success, -EINVAL/-EIO on error. */
static int update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
                                   int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, refcount_block_offset;
    int ret, refcount_table_index, block_index, refcount;
    uint64_t data64;

    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (refcount_table_index >= s->refcount_table_size) {
        /* cluster lies beyond the current table: cannot decrement there */
        if (addend < 0)
            return -EINVAL;
        ret = grow_refcount_table(bs, refcount_table_index + 1);
        if (ret < 0)
            return ret;
    }
    refcount_block_offset = s->refcount_table[refcount_table_index];
    if (!refcount_block_offset) {
        if (addend < 0)
            return -EINVAL;
        /* create a new refcount block */
        /* Note: we cannot update the refcount now to avoid recursion */
        offset = alloc_clusters_noref(bs, s->cluster_size);
        memset(s->refcount_block_cache, 0, s->cluster_size);
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
        if (ret != s->cluster_size)
            return -EINVAL;
        /* hook the new block into the refcount table, on disk too */
        s->refcount_table[refcount_table_index] = offset;
        data64 = cpu_to_be64(offset);
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
                          refcount_table_index * sizeof(uint64_t),
                          &data64, sizeof(data64));
        if (ret != sizeof(data64))
            return -EINVAL;

        refcount_block_offset = offset;
        s->refcount_block_cache_offset = offset;
        /* now it is safe to refcount the block's own cluster */
        update_refcount(bs, offset, s->cluster_size, 1);
    } else {
        if (refcount_block_offset != s->refcount_block_cache_offset) {
            if (load_refcount_block(bs, refcount_block_offset) < 0)
                return -EIO;
        }
    }
    /* we can update the count and save it */
    block_index = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
    refcount += addend;
    if (refcount < 0 || refcount > 0xffff)
        return -EINVAL;
    /* remember the lowest free cluster so the allocator can reuse it */
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
    /* write back just the modified 16-bit entry */
    if (bdrv_pwrite(s->hd,
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
                    &s->refcount_block_cache[block_index], 2) != 2)
        return -EIO;
    return refcount;
}
2015

    
2016
/* Apply @addend to the refcount of every cluster overlapping the byte
 * range [offset, offset + length).  A non-positive length is a no-op.
 * Note: errors from update_cluster_refcount() are not propagated. */
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t first_cluster, last_cluster, c;

#ifdef DEBUG_ALLOC2
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
           offset, length, addend);
#endif
    if (length <= 0)
        return;
    /* round the range outward to cluster boundaries */
    first_cluster = offset & ~(s->cluster_size - 1);
    last_cluster = (offset + length - 1) & ~(s->cluster_size - 1);
    for(c = first_cluster; c <= last_cluster; c += s->cluster_size)
        update_cluster_refcount(bs, c >> s->cluster_bits, addend);
}
2036

    
2037
#ifdef DEBUG_ALLOC
2038
/* Consistency-check helper: bump the expected refcount (in the shadow
 * @refcount_table array) of every cluster overlapping the byte range
 * [offset, offset + size).  Out-of-range clusters and 16-bit counter
 * overflows are reported on stdout. */
static void inc_refcounts(BlockDriverState *bs,
                          uint16_t *refcount_table,
                          int refcount_table_size,
                          int64_t offset, int64_t size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t first_cluster, last_cluster, cluster_offset;
    int idx;

    if (size <= 0)
        return;

    first_cluster = offset & ~(s->cluster_size - 1);
    last_cluster = (offset + size - 1) & ~(s->cluster_size - 1);
    for(cluster_offset = first_cluster; cluster_offset <= last_cluster;
        cluster_offset += s->cluster_size) {
        idx = cluster_offset >> s->cluster_bits;
        if (idx < 0 || idx >= refcount_table_size) {
            printf("ERROR: invalid cluster offset=0x%llx\n", cluster_offset);
            continue;
        }
        if (++refcount_table[idx] == 0)
            printf("ERROR: overflow cluster offset=0x%llx\n", cluster_offset);
    }
}
2064

    
2065
/* Consistency-check one L1 table (active image or snapshot): add the
 * expected refcounts of the L1 table itself, every referenced L2 table
 * and every data/compressed cluster into @refcount_table.  With
 * @check_copied set, also verify that QCOW_OFLAG_COPIED is set exactly
 * on entries whose cluster refcount is 1.  Problems are reported on
 * stdout.  Returns 0 on success, -EIO on read error (allocation
 * failure also takes the -EIO path). */
static int check_refcounts_l1(BlockDriverState *bs,
                              uint16_t *refcount_table,
                              int refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int check_copied)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
    int l2_size, i, j, nb_csectors, refcount;

    l2_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* the clusters occupied by the L1 table itself */
    inc_refcounts(bs, refcount_table, refcount_table_size,
                  l1_table_offset, l1_size2);

    l1_table = qemu_malloc(l1_size2);
    if (!l1_table)
        goto fail;
    if (bdrv_pread(s->hd, l1_table_offset,
                   l1_table, l1_size2) != l1_size2)
        goto fail;
    for(i = 0;i < l1_size; i++)
        be64_to_cpus(&l1_table[i]);

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    if (!l2_table)
        goto fail;
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            if (check_copied) {
                /* COPIED must be set iff the L2 cluster refcount is 1 */
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
                    printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
                           l2_offset, refcount);
                }
            }
            l2_offset &= ~QCOW_OFLAG_COPIED;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        if (offset & QCOW_OFLAG_COPIED) {
                            printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
                                   offset >> s->cluster_bits);
                            offset &= ~QCOW_OFLAG_COPIED;
                        }
                        /* compressed clusters are accounted per 512-byte
                           sector, rounded down/up to sector boundaries */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        offset &= s->cluster_offset_mask;
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset & ~511, nb_csectors * 512);
                    } else {
                        if (check_copied) {
                            /* same COPIED-iff-refcount-1 rule for data */
                            refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                            if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
                                printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
                                       offset, refcount);
                            }
                        }
                        offset &= ~QCOW_OFLAG_COPIED;
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset, s->cluster_size);
                    }
                }
            }
            /* the cluster holding this L2 table */
            inc_refcounts(bs, refcount_table,
                          refcount_table_size,
                          l2_offset,
                          s->cluster_size);
        }
    }
    qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    printf("ERROR: I/O error in check_refcounts_l1\n");
    qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
2152

    
2153
/* Full image consistency check (DEBUG_ALLOC builds): rebuild the
 * expected refcount of every cluster — header, active and snapshot
 * L1/L2 trees, snapshot table, refcount structures — in a shadow array,
 * then compare it against the refcounts stored in the image.
 * Mismatches are printed on stdout. */
static void check_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t size;
    int nb_clusters, refcount1, refcount2, i;
    QCowSnapshot *sn;
    uint16_t *refcount_table;

    size = bdrv_getlength(s->hd);
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
    /* fix: the allocation result was previously used unchecked */
    if (!refcount_table) {
        printf("ERROR: check_refcounts: out of memory\n");
        return;
    }

    /* header */
    inc_refcounts(bs, refcount_table, nb_clusters,
                  0, s->cluster_size);

    /* active L1/L2 tables, with QCOW_OFLAG_COPIED verification */
    check_refcounts_l1(bs, refcount_table, nb_clusters,
                       s->l1_table_offset, s->l1_size, 1);

    /* snapshots */
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        check_refcounts_l1(bs, refcount_table, nb_clusters,
                           sn->l1_table_offset, sn->l1_size, 0);
    }
    inc_refcounts(bs, refcount_table, nb_clusters,
                  s->snapshots_offset, s->snapshots_size);

    /* refcount data */
    inc_refcounts(bs, refcount_table, nb_clusters,
                  s->refcount_table_offset,
                  s->refcount_table_size * sizeof(uint64_t));
    for(i = 0; i < s->refcount_table_size; i++) {
        int64_t offset;
        offset = s->refcount_table[i];
        if (offset != 0) {
            inc_refcounts(bs, refcount_table, nb_clusters,
                          offset, s->cluster_size);
        }
    }

    /* compare ref counts */
    for(i = 0; i < nb_clusters; i++) {
        refcount1 = get_refcount(bs, i);
        refcount2 = refcount_table[i];
        if (refcount1 != refcount2)
            printf("ERROR cluster %d refcount=%d reference=%d\n",
                   i, refcount1, refcount2);
    }

    qemu_free(refcount_table);
}
2205

    
2206
#if 0
2207
/* Debug helper (currently compiled out): print runs of clusters that
 * share the same refcount, one line per run showing the run's first
 * cluster index, the refcount and the run length. */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(s->hd);
    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
    for(k = 0; k < nb_clusters;) {
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        /* fix: print the start of the run (k1), not the cluster past
           its end (k) — nb is computed relative to k1 */
        printf("%lld: refcount=%d nb=%lld\n", k1, refcount, k - k1);
    }
}
2224
#endif
2225
#endif
2226

    
2227
BlockDriver bdrv_qcow2 = {
2228
    "qcow2",
2229
    sizeof(BDRVQcowState),
2230
    qcow_probe,
2231
    qcow_open,
2232
    NULL,
2233
    NULL,
2234
    qcow_close,
2235
    qcow_create,
2236
    qcow_flush,
2237
    qcow_is_allocated,
2238
    qcow_set_key,
2239
    qcow_make_empty,
2240

    
2241
    .bdrv_aio_read = qcow_aio_read,
2242
    .bdrv_aio_write = qcow_aio_write,
2243
    .bdrv_aio_cancel = qcow_aio_cancel,
2244
    .aiocb_size = sizeof(QCowAIOCB),
2245
    .bdrv_write_compressed = qcow_write_compressed,
2246

    
2247
    .bdrv_snapshot_create = qcow_snapshot_create,
2248
    .bdrv_snapshot_goto = qcow_snapshot_goto,
2249
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2250
    .bdrv_snapshot_list = qcow_snapshot_list,
2251
    .bdrv_get_info = qcow_get_info,
2252
};