/*
 * Imported from a repository-browser view of block-qcow2.c @ revision f8de1660.
 * (83.8 kB original; this chunk is the header, open/key, and cluster-mapping code.)
 */

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
#include "block_int.h"
#include "aes.h"

#include <assert.h>
#include <string.h>
#include <zlib.h>
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots.
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables always have a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
47
//#define DEBUG_ALLOC2
48
//#define DEBUG_EXT
49

    
50
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
51
#define QCOW_VERSION 2
52

    
53
#define QCOW_CRYPT_NONE 0
54
#define QCOW_CRYPT_AES  1
55

    
56
#define QCOW_MAX_CRYPT_CLUSTERS 32
57

    
58
/* indicate that the refcount of the referenced cluster is exactly one. */
59
#define QCOW_OFLAG_COPIED     (1LL << 63)
60
/* indicate that the cluster is compressed (they never have the copied flag) */
61
#define QCOW_OFLAG_COMPRESSED (1LL << 62)
62

    
63
#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
64

    
65
/* On-disk qcow2 file header. All multi-byte fields are stored big-endian
 * on disk and byte-swapped to host order in qcow_open(). */
typedef struct QCowHeader {
    uint32_t magic;                 /* QCOW_MAGIC */
    uint32_t version;               /* QCOW_VERSION (2) */
    uint64_t backing_file_offset;   /* offset of backing file name, 0 if none */
    uint32_t backing_file_size;     /* length of backing file name, in bytes */
    uint32_t cluster_bits;          /* log2(cluster size); qcow_open accepts 9..16 */
    uint64_t size; /* in bytes */
    uint32_t crypt_method;          /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
    uint64_t l1_table_offset;       /* offset of the active L1 table */
    uint64_t refcount_table_offset; /* offset of the refcount table */
    uint32_t refcount_table_clusters; /* refcount table size, in clusters */
    uint32_t nb_snapshots;          /* number of snapshot table entries */
    uint64_t snapshots_offset;      /* offset of the snapshot table */
} QCowHeader;
80

    
81

    
82
/* Header extension entry: a (magic, len) pair placed after the fixed
 * header; 'len' payload bytes follow, padded to an 8-byte boundary
 * (see qcow_read_extensions()). */
typedef struct {
    uint32_t magic;
    uint32_t len;
} QCowExtension;
#define  QCOW_EXT_MAGIC_END 0
#define  QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
88

    
89

    
90
/* On-disk layout of one snapshot table entry. The fixed part below is
 * followed by extra_data_size bytes of extra data, then the id string and
 * the name (sizes given by id_str_size/name_size). */
typedef struct __attribute__((packed)) QCowSnapshotHeader {
    /* header is 8 byte aligned */
    uint64_t l1_table_offset;   /* copy of the L1 table at snapshot time */

    uint32_t l1_size;
    uint16_t id_str_size;
    uint16_t name_size;

    uint32_t date_sec;          /* host time of the snapshot */
    uint32_t date_nsec;

    uint64_t vm_clock_nsec;     /* guest clock at snapshot time */

    uint32_t vm_state_size;     /* size of saved VM state, 0 if none */
    uint32_t extra_data_size; /* for extension */
    /* extra data follows */
    /* id_str follows */
    /* name follows  */
} QCowSnapshotHeader;
109

    
110
#define L2_CACHE_SIZE 16
111

    
112
/* In-memory description of one snapshot (see QCowSnapshotHeader for the
 * on-disk form). */
typedef struct QCowSnapshot {
    uint64_t l1_table_offset;   /* file offset of this snapshot's L1 table */
    uint32_t l1_size;
    char *id_str;               /* NUL-terminated id string */
    char *name;                 /* NUL-terminated display name */
    uint32_t vm_state_size;
    uint32_t date_sec;
    uint32_t date_nsec;
    uint64_t vm_clock_nsec;
} QCowSnapshot;
122

    
123
/* In-memory driver state for an open qcow2 image (bs->opaque). */
typedef struct BDRVQcowState {
    BlockDriverState *hd;       /* underlying image file */
    int cluster_bits;           /* log2(cluster_size) */
    int cluster_size;           /* cluster size in bytes (1 << cluster_bits) */
    int cluster_sectors;        /* cluster size in 512-byte sectors */
    int l2_bits;                /* log2(l2_size); L2 is always one cluster */
    int l2_size;                /* entries per L2 table */
    int l1_size;                /* entries in the active L1 table */
    int l1_vm_state_index;      /* number of L1 entries covering the disk size */
    int csize_shift;            /* bit position of the compressed-size field */
    int csize_mask;             /* mask for the compressed-size field */
    uint64_t cluster_offset_mask; /* mask of the offset bits in an L2 entry */
    uint64_t l1_table_offset;   /* file offset of the active L1 table */
    uint64_t *l1_table;         /* L1 table in host byte order */
    uint64_t *l2_cache;         /* L2_CACHE_SIZE cached L2 tables, contiguous */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /* file offset of each slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* hit counters for LFU eviction */
    uint8_t *cluster_cache;     /* buffer for a decompressed cluster */
    uint8_t *cluster_data;      /* bounce buffer for encryption / COW copies */
    uint64_t cluster_cache_offset; /* offset cached in cluster_cache, -1 if none */

    /* refcount machinery (see refcount_init/refcount_close) */
    uint64_t *refcount_table;
    uint64_t refcount_table_offset;
    uint32_t refcount_table_size;
    uint64_t refcount_block_cache_offset;
    uint16_t *refcount_block_cache;
    int64_t free_cluster_index;
    int64_t free_byte_offset;

    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header; /* crypt method declared in the header */
    AES_KEY aes_encrypt_key;
    AES_KEY aes_decrypt_key;
    uint64_t snapshots_offset;
    int snapshots_size;
    int nb_snapshots;
    QCowSnapshot *snapshots;
} BDRVQcowState;
161

    
162
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
163
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
164
                     uint8_t *buf, int nb_sectors);
165
static int qcow_read_snapshots(BlockDriverState *bs);
166
static void qcow_free_snapshots(BlockDriverState *bs);
167
static int refcount_init(BlockDriverState *bs);
168
static void refcount_close(BlockDriverState *bs);
169
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
170
static int update_cluster_refcount(BlockDriverState *bs,
171
                                   int64_t cluster_index,
172
                                   int addend);
173
static void update_refcount(BlockDriverState *bs,
174
                            int64_t offset, int64_t length,
175
                            int addend);
176
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
177
static int64_t alloc_bytes(BlockDriverState *bs, int size);
178
static void free_clusters(BlockDriverState *bs,
179
                          int64_t offset, int64_t size);
180
#ifdef DEBUG_ALLOC
181
static void check_refcounts(BlockDriverState *bs);
182
#endif
183

    
184
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
185
{
186
    const QCowHeader *cow_header = (const void *)buf;
187

    
188
    if (buf_size >= sizeof(QCowHeader) &&
189
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
190
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
191
        return 100;
192
    else
193
        return 0;
194
}
195

    
196

    
197
/*
 * read qcow2 extensions and fill bs
 * start reading from start_offset
 * finish reading upon magic of value 0 or when end_offset is reached
 * unknown magic is skipped (future extension this version knows nothing about)
 * return 0 upon success, non-0 otherwise
 */
static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
                                uint64_t end_offset)
{
    BDRVQcowState *s = bs->opaque;
    QCowExtension ext;
    uint64_t offset;

#ifdef DEBUG_EXT
    printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
#endif
    offset = start_offset;
    while (offset < end_offset) {

#ifdef DEBUG_EXT
        /* Sanity check */
        if (offset > s->cluster_size)
            printf("qcow_handle_extension: suspicious offset %lu\n", offset);

        printf("attemting to read extended header in offset %lu\n", offset);
#endif

        /* each extension is a (magic, len) pair followed by 'len' payload
         * bytes, padded out to an 8-byte boundary */
        if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
                    (unsigned long long)offset);
            return 1;
        }
        be32_to_cpus(&ext.magic);
        be32_to_cpus(&ext.len);
        offset += sizeof(ext);
#ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
#endif
        switch (ext.magic) {
        case QCOW_EXT_MAGIC_END:
            return 0;

        case QCOW_EXT_MAGIC_BACKING_FORMAT:
            /* payload is the (unterminated) backing format name; it must
             * leave room for the NUL appended below */
            if (ext.len >= sizeof(bs->backing_format)) {
                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
                        " (>=%zu)\n",
                        ext.len, sizeof(bs->backing_format));
                return 2;
            }
            if (bdrv_pread(s->hd, offset , bs->backing_format,
                           ext.len) != ext.len)
                return 3;
            bs->backing_format[ext.len] = '\0';
#ifdef DEBUG_EXT
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
#endif
            offset += ((ext.len + 7) & ~7);
            break;

        default:
            /* unknown magic -- just skip it */
            offset += ((ext.len + 7) & ~7);
            break;
        }
    }

    return 0;
}
266

    
267

    
268
/* Open a qcow2 image: validate the header, set up the derived geometry
 * fields, load the L1 table, initialize the refcount machinery, then read
 * header extensions, the backing file name and the snapshot table.
 * Returns 0 on success, a negative value on failure (all partially
 * allocated state is released on the fail path). */
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift, ret;
    QCowHeader header;
    uint64_t ext_end;

    /* Performance is terrible right now with cache=writethrough due mainly
     * to reference count updates.  If the user does not explicitly specify
     * a caching type, force to writeback caching.
     */
    if ((flags & BDRV_O_CACHE_DEF)) {
        flags |= BDRV_O_CACHE_WB;
        flags &= ~BDRV_O_CACHE_DEF;
    }
    ret = bdrv_file_open(&s->hd, filename, flags);
    if (ret < 0)
        return ret;
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
        goto fail;
    /* all header fields are big-endian on disk */
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
        goto fail;
    /* cluster size is restricted to 512 bytes .. 64 KB */
    if (header.size <= 1 ||
        header.cluster_bits < 9 ||
        header.cluster_bits > 16)
        goto fail;
    if (header.crypt_method > QCOW_CRYPT_AES)
        goto fail;
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header)
        bs->encrypted = 1;
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    /* layout of a compressed-cluster L2 entry: low csize_shift bits are the
     * host offset, the next (cluster_bits - 8) bits the size in sectors */
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
    s->l1_size = header.l1_size;
    shift = s->cluster_bits + s->l2_bits;
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index)
        goto fail;
    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
        s->l1_size * sizeof(uint64_t))
        goto fail;
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    s->cluster_cache = qemu_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                                  + 512);
    s->cluster_cache_offset = -1;

    if (refcount_init(bs) < 0)
        goto fail;

    /* read qcow2 extensions: they live between the fixed header and the
     * backing file name (or the end of the first cluster if there is none) */
    if (header.backing_file_offset)
        ext_end = header.backing_file_offset;
    else
        ext_end = s->cluster_size;
    if (qcow_read_extensions(bs, sizeof(header), ext_end))
        goto fail;

    /* read the backing file name (truncated to fit bs->backing_file) */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023)
            len = 1023;
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
            goto fail;
        bs->backing_file[len] = '\0';
    }
    if (qcow_read_snapshots(bs) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;

 fail:
    qcow_free_snapshots(bs);
    refcount_close(bs);
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    bdrv_delete(s->hd);
    return -1;
}
391

    
392
/* Install the encryption key for an encrypted image. The password is
 * truncated or zero-padded to 16 bytes and used directly as the 128-bit
 * AES key; the crypt method is activated from the header value.
 * Returns 0 on success, -1 if the AES key schedule setup fails. */
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len;

    memset(keybuf, 0, 16);
    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    memcpy(keybuf, key, len);
    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
        return -1;
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;
    return 0;
}
433

    
434
/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported.
   Each 512-byte sector is processed independently in AES-CBC mode with an
   IV built from its sector number (little-endian 64-bit in the low 8 IV
   bytes, high 8 bytes zero). 'enc' selects encryption vs decryption and
   'key' must be the matching AES key schedule. */
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                            uint8_t *out_buf, const uint8_t *in_buf,
                            int nb_sectors, int enc,
                            const AES_KEY *key)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;

    for(i = 0; i < nb_sectors; i++) {
        /* fresh IV per sector: CBC state must not leak across sectors */
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
                        ivec.b, enc);
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
}
458

    
459
/* Copy-on-write helper: copy sectors [n_start, n_end) of the guest cluster
 * starting at guest sector 'start_sect' into the host cluster at
 * 'cluster_offset'. The data is read through qcow_read() (so it may come
 * from the backing file or a compressed cluster) and re-encrypted before
 * being written when encryption is enabled.
 * Returns 0 on success, a negative value on read/write error. */
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                        uint64_t cluster_offset, int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int n, ret;

    n = n_end - n_start;
    if (n <= 0)
        return 0;
    /* s->cluster_data is large enough for a full cluster (see qcow_open) */
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
    if (ret < 0)
        return ret;
    if (s->crypt_method) {
        encrypt_sectors(s, start_sect + n_start,
                        s->cluster_data,
                        s->cluster_data, n, 1,
                        &s->aes_encrypt_key);
    }
    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
                     s->cluster_data, n);
    if (ret < 0)
        return ret;
    return 0;
}
483

    
484
/* Invalidate the whole L2 table cache: clear the cached tables, the
 * offset tags and the hit counters. */
static void l2_cache_reset(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    size_t table_bytes = (size_t)s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);

    memset(s->l2_cache, 0, table_bytes);
    memset(s->l2_cache_offsets, 0, sizeof(s->l2_cache_offsets));
    memset(s->l2_cache_counts, 0, sizeof(s->l2_cache_counts));
}
492

    
493
/* Choose the L2 cache slot to (re)use: the slot with the smallest hit
 * count (first one on ties). */
static inline int l2_cache_new_entry(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int slot, best = 0;

    for (slot = 1; slot < L2_CACHE_SIZE; slot++) {
        if (s->l2_cache_counts[slot] < s->l2_cache_counts[best])
            best = slot;
    }
    return best;
}
510

    
511
/* Round 'offset' up to the next multiple of 'n' (n must be a power of
 * two; an already-aligned offset is returned unchanged). */
static int64_t align_offset(int64_t offset, int n)
{
    return (offset + n - 1) & ~(int64_t)(n - 1);
}
516

    
517
/* Grow the active L1 table so it has at least 'min_size' entries.
 * A new, larger table is allocated and written to freshly allocated
 * clusters, the header's l1_size/l1_table_offset pair is updated on disk,
 * and only then are the old table's clusters freed, so a crash mid-way
 * leaves a consistent image. Returns 0 on success, -EIO on write failure.
 *
 * Fix vs. previous version: the error path freed s->l1_table (leaving a
 * dangling pointer that would be freed again on close) and leaked both
 * new_l1_table and its clusters; it now releases only the new resources. */
static int grow_l1_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size, new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    uint64_t new_l1_table_offset;
    uint8_t data[12];

    new_l1_size = s->l1_size;
    if (min_size <= new_l1_size)
        return 0;
    /* grow by at least 50% per round to amortize the rewrites */
    while (min_size > new_l1_size) {
        new_l1_size = (new_l1_size * 3 + 1) / 2;
    }
#ifdef DEBUG_ALLOC2
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_mallocz(new_l1_size2);
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);

    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
    if (ret != new_l1_size2)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table in the header (l1_size and l1_table_offset are
     * adjacent in QCowHeader, so one 12-byte write updates both) */
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
                sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->l1_table);
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
 fail:
    /* release only the new table: s->l1_table must stay valid */
    qemu_free(new_l1_table);
    free_clusters(bs, new_l1_table_offset, new_l1_size2);
    return -EIO;
}
566

    
567
/*
568
 * seek_l2_table
569
 *
570
 * seek l2_offset in the l2_cache table
571
 * if not found, return NULL,
572
 * if found,
573
 *   increments the l2 cache hit count of the entry,
574
 *   if counter overflow, divide by two all counters
575
 *   return the pointer to the l2 cache entry
576
 *
577
 */
578

    
579
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
580
{
581
    int i, j;
582

    
583
    for(i = 0; i < L2_CACHE_SIZE; i++) {
584
        if (l2_offset == s->l2_cache_offsets[i]) {
585
            /* increment the hit count */
586
            if (++s->l2_cache_counts[i] == 0xffffffff) {
587
                for(j = 0; j < L2_CACHE_SIZE; j++) {
588
                    s->l2_cache_counts[j] >>= 1;
589
                }
590
            }
591
            return s->l2_cache + (i << s->l2_bits);
592
        }
593
    }
594
    return NULL;
595
}
596

    
597
/*
 * l2_load
 *
 * Loads a L2 table into memory. If the table is in the cache, the cache
 * is used; otherwise the L2 table is loaded from the image file into the
 * least-recently-hit cache slot.
 *
 * Returns a pointer to the L2 table on success, or NULL if the read from
 * the image file failed.
 */
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
{
    BDRVQcowState *s = bs->opaque;
    int min_index;
    uint64_t *l2_table;

    /* seek if the table for the given offset is in the cache */
    l2_table = seek_l2_table(s, l2_offset);
    if (l2_table != NULL)
        return l2_table;

    /* not found: load a new entry in the least used one */
    min_index = l2_cache_new_entry(bs);
    l2_table = s->l2_cache + (min_index << s->l2_bits);
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
        s->l2_size * sizeof(uint64_t))
        return NULL;
    /* tag the slot so later lookups hit; counter starts at 1 */
    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;

    return l2_table;
}
631

    
632
/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 * Returns a pointer to the (cached) new L2 table, or NULL on I/O error.
 * The L1 entry is marked QCOW_OFLAG_COPIED since the new table has
 * refcount one.
 */
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
{
    BDRVQcowState *s = bs->opaque;
    int min_index;
    uint64_t old_l2_offset, tmp;
    uint64_t *l2_table, l2_offset;

    old_l2_offset = s->l1_table[l1_index];

    /* allocate a new l2 entry */
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));

    /* update the L1 entry (in memory and on disk)
     * NOTE(review): the L1 entry is written before the new L2 table
     * itself — confirm crash-consistency assumptions before reordering */
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;

    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
                    &tmp, sizeof(tmp)) != sizeof(tmp))
        return NULL;

    /* allocate a new entry in the l2 cache */
    min_index = l2_cache_new_entry(bs);
    l2_table = s->l2_cache + (min_index << s->l2_bits);

    if (old_l2_offset == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        /* if there was an old l2 table, read it from the disk */
        if (bdrv_pread(s->hd, old_l2_offset,
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return NULL;
    }
    /* write the l2 table to the file */
    if (bdrv_pwrite(s->hd, l2_offset,
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
        s->l2_size * sizeof(uint64_t))
        return NULL;

    /* update the l2 cache entry */
    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;

    return l2_table;
}
692

    
693
/* Number of clusters needed to hold 'size' bytes (rounded up). */
static int size_to_clusters(BDRVQcowState *s, int64_t size)
{
    int64_t rounded_up = size + s->cluster_size - 1;
    return rounded_up >> s->cluster_bits;
}
697

    
698
/*
 * Count how many L2 entries, starting at index 'start', map clusters that
 * are physically contiguous with the run beginning at l2_table[0] (bits in
 * 'mask' are ignored when comparing). Returns the run length, at most
 * 'nb_clusters'; 0 if the first entry is unallocated.
 */
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t start, uint64_t mask)
{
    uint64_t base = be64_to_cpu(l2_table[0]) & ~mask;
    int i;

    if (!base)
        return 0;

    for (i = start; i < start + nb_clusters; i++) {
        if ((be64_to_cpu(l2_table[i]) & ~mask) != base + i * cluster_size)
            break;
    }
    return i - start;
}
713

    
714
/* Count the leading unallocated (all-zero) entries of 'l2_table', looking
 * at no more than 'nb_clusters' entries. */
static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
{
    int n = 0;

    while (nb_clusters != 0 && l2_table[n] == 0) {
        nb_clusters--;
        n++;
    }
    return n;
}
723

    
724
/*
 * get_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * on entry, *num is the number of contiguous clusters we'd like to
 * access following offset.
 *
 * on exit, *num is the number of contiguous sectors we can read.
 *
 * Return the host cluster offset (0 meaning unallocated); the
 * QCOW_OFLAG_COPIED flag is stripped from the returned value.
 */
static uint64_t get_cluster_offset(BlockDriverState *bs,
                                   uint64_t offset, int *num)
{
    BDRVQcowState *s = bs->opaque;
    int l1_index, l2_index;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int l1_bits, c;
    int index_in_cluster, nb_available, nb_needed, nb_clusters;

    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
    /* sectors needed, counted from the start of the first cluster */
    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the offset and
     * the end of the l1 entry
     */
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));

    /* compute the number of available sectors */
    nb_available = (nb_available >> 9) + index_in_cluster;

    /* never look past the end of the current L1 entry */
    if (nb_needed > nb_available) {
        nb_needed = nb_available;
    }

    cluster_offset = 0;

    /* seek the l2 offset in the l1 table */
    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size)
        goto out;

    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */
    if (!l2_offset)
        goto out;

    /* load the l2 table in memory */
    l2_offset &= ~QCOW_OFLAG_COPIED;
    l2_table = l2_load(bs, l2_offset);
    if (l2_table == NULL)
        return 0;

    /* find the cluster offset for the given disk offset */
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    nb_clusters = size_to_clusters(s, nb_needed << 9);

    if (!cluster_offset) {
        /* how many empty clusters ? */
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
    } else {
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
    }

    nb_available = (c * s->cluster_sectors);
out:
    if (nb_available > nb_needed)
        nb_available = nb_needed;

    /* report back sectors usable from 'offset' itself */
    *num = nb_available - index_in_cluster;

    return cluster_offset & ~QCOW_OFLAG_COPIED;
}
814

    
815
/*
816
 * free_any_clusters
817
 *
818
 * free clusters according to its type: compressed or not
819
 *
820
 */
821

    
822
static void free_any_clusters(BlockDriverState *bs,
823
                              uint64_t cluster_offset, int nb_clusters)
824
{
825
    BDRVQcowState *s = bs->opaque;
826

    
827
    /* free the cluster */
828

    
829
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
830
        int nb_csectors;
831
        nb_csectors = ((cluster_offset >> s->csize_shift) &
832
                       s->csize_mask) + 1;
833
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
834
                      nb_csectors * 512);
835
        return;
836
    }
837

    
838
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
839

    
840
    return;
841
}
842

    
843
/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the l2 table.
 *
 * the l2 table offset in the qcow2 file and the cluster index
 * in the l2 table are given to the caller.
 *
 * Returns 1 on success, 0 on failure.
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             uint64_t *new_l2_offset,
                             int *new_l2_index)
{
    BDRVQcowState *s = bs->opaque;
    int l1_index, l2_index, ret;
    uint64_t l2_offset, *l2_table;

    /* seek the l2 offset in the l1 table, growing the L1 table if the
     * offset lies beyond its current coverage */
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = grow_l1_table(bs, l1_index + 1);
        if (ret < 0)
            return 0;
    }
    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */
    if (l2_offset & QCOW_OFLAG_COPIED) {
        /* refcount is one: the table is writable in place, just load it */
        l2_offset &= ~QCOW_OFLAG_COPIED;
        l2_table = l2_load(bs, l2_offset);
        if (l2_table == NULL)
            return 0;
    } else {
        /* shared (or absent) L2 table: release our reference to the old
         * one and allocate a private copy (copy-on-write of the table) */
        if (l2_offset)
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
        l2_table = l2_allocate(bs, l1_index);
        if (l2_table == NULL)
            return 0;
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
    }

    /* find the cluster offset for the given disk offset */
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_offset = l2_offset;
    *new_l2_index = l2_index;

    return 1;
}
900

    
901
/*
902
 * alloc_compressed_cluster_offset
903
 *
904
 * For a given offset of the disk image, return cluster offset in
905
 * qcow2 file.
906
 *
907
 * If the offset is not found, allocate a new compressed cluster.
908
 *
909
 * Return the cluster offset if successful,
910
 * Return 0, otherwise.
911
 *
912
 */
913

    
914
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
                                                uint64_t offset,
                                                int compressed_size)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
    if (ret == 0)
        return 0;

    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & QCOW_OFLAG_COPIED)
        /* already allocated and writable in place: keep it as-is */
        return cluster_offset & ~QCOW_OFLAG_COPIED;

    /* drop whatever the entry pointed to before (compressed or shared) */
    if (cluster_offset)
        free_any_clusters(bs, cluster_offset, 1);

    /* compressed data is packed at byte granularity; record how many
       512-byte sectors it spans in the descriptor's csize field */
    cluster_offset = alloc_bytes(bs, compressed_size);
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    if (bdrv_pwrite(s->hd,
                    l2_offset + l2_index * sizeof(uint64_t),
                    l2_table + l2_index,
                    sizeof(uint64_t)) != sizeof(uint64_t))
        /* NOTE(review): the bytes reserved by alloc_bytes() above are not
           released on this error path — apparent space leak on I/O error */
        return 0;

    return cluster_offset;
}
954

    
955
/*
 * Bookkeeping for a cluster allocation that is in flight: filled in by
 * alloc_cluster_offset() and consumed by alloc_cluster_link_l2() once
 * the guest data has hit the disk.
 */
typedef struct QCowL2Meta
{
    uint64_t offset;        /* guest offset the allocation was made for */
    int n_start;            /* first written sector within the first cluster */
    int nb_available;       /* sectors covered by this allocation (clipped) */
    int nb_clusters;        /* newly allocated clusters; 0 = nothing to link */
} QCowL2Meta;
962

    
963
/*
 * Link freshly written clusters into the L2 table.  Copies the head and
 * tail sectors that the guest write did not cover (copy-on-write), then
 * points the L2 entries at the new clusters and frees the clusters they
 * replaced.  Returns 0 on success, a negative value on failure.
 */
static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
        QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;

    /* nothing was allocated (e.g. the write hit COPIED clusters) */
    if (m->nb_clusters == 0)
        return 0;

    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));

    /* copy content of unmodified sectors */
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
    if (m->n_start) {
        /* head of the first cluster, before the written range */
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
        if (ret < 0)
            goto err;
    }

    if (m->nb_available & (s->cluster_sectors - 1)) {
        /* tail of the last cluster, after the written range */
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
                m->nb_available - end, s->cluster_sectors);
        if (ret < 0)
            goto err;
    }

    ret = -EIO;
    /* update L2 table */
    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
        goto err;

    for (i = 0; i < m->nb_clusters; i++) {
        /* remember the entries we overwrite so they can be freed once
           the table update is safely on disk */
        if(l2_table[l2_index + i] != 0)
            old_cluster[j++] = l2_table[l2_index + i];

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
     }

    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
            m->nb_clusters * sizeof(uint64_t))
        goto err;

    /* the old mappings are no longer referenced by this L2 table */
    for (i = 0; i < j; i++)
        free_any_clusters(bs, old_cluster[i], 1);

    ret = 0;
err:
    qemu_free(old_cluster);
    return ret;
 }
1017

    
1018
/*
1019
 * alloc_cluster_offset
1020
 *
1021
 * For a given offset of the disk image, return cluster offset in
1022
 * qcow2 file.
1023
 *
1024
 * If the offset is not found, allocate a new cluster.
1025
 *
1026
 * Return the cluster offset if successful,
1027
 * Return 0, otherwise.
1028
 *
1029
 */
1030

    
1031
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
                                     uint64_t offset,
                                     int n_start, int n_end,
                                     int *num, QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int nb_clusters, i = 0;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
    if (ret == 0)
        return 0;

    /* upper bound: clusters needed for the request ... */
    nb_clusters = size_to_clusters(s, n_end << 9);

    /* ... clipped so we never run past the end of this L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* We keep all QCOW_OFLAG_COPIED clusters */

    if (cluster_offset & QCOW_OFLAG_COPIED) {
        /* writable in place: no allocation, no L2 update needed */
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, 0);

        cluster_offset &= ~QCOW_OFLAG_COPIED;
        m->nb_clusters = 0;

        goto out;
    }

    /* for the moment, multiple compressed clusters are not managed */

    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
        nb_clusters = 1;

    /* how many available clusters ? */

    /* scan forward over runs that can be replaced in one allocation:
       allocated-but-shared runs followed by free runs, stopping at the
       first COPIED or COMPRESSED entry */
    while (i < nb_clusters) {
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
                &l2_table[l2_index], i, 0);

        /* NOTE(review): when i == nb_clusters this reads the entry just
           past the clipped range — confirm count_contiguous_* cannot
           return the full remaining count here */
        if(be64_to_cpu(l2_table[l2_index + i]))
            break;

        i += count_contiguous_free_clusters(nb_clusters - i,
                &l2_table[l2_index + i]);

        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);

        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
            break;
    }
    nb_clusters = i;

    /* allocate a new cluster */

    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);

    /* save info needed for meta data update */
    m->offset = offset;
    m->n_start = n_start;
    m->nb_clusters = nb_clusters;

out:
    /* sectors the caller may write before needing another allocation */
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);

    *num = m->nb_available - n_start;

    return cluster_offset;
}
1104

    
1105
/*
 * Report whether the run of sectors starting at sector_num is allocated
 * in this image.  *pnum is set to the number of sectors (at most
 * nb_sectors) that share that allocation state.
 */
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    *pnum = nb_sectors;
    /* get_cluster_offset() clips *pnum and yields 0 for unallocated */
    return get_cluster_offset(bs, sector_num << 9, pnum) != 0;
}
1115

    
1116
/*
 * Inflate a raw deflate stream (no zlib header, -12 window bits) from
 * buf into out_buf.  Succeeds only if exactly out_buf_size bytes are
 * produced.  Returns 0 on success, -1 on any zlib or size error.
 */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream zs;
    int status, produced;

    memset(&zs, 0, sizeof(zs));
    zs.next_in = (uint8_t *)buf;
    zs.avail_in = buf_size;
    zs.next_out = out_buf;
    zs.avail_out = out_buf_size;

    if (inflateInit2(&zs, -12) != Z_OK)
        return -1;

    status = inflate(&zs, Z_FINISH);
    produced = zs.next_out - out_buf;
    inflateEnd(&zs);

    /* Z_BUF_ERROR is tolerated: the input may carry trailing padding */
    if ((status != Z_STREAM_END && status != Z_BUF_ERROR) ||
        produced != out_buf_size)
        return -1;

    return 0;
}
1142

    
1143
/*
 * Make the compressed cluster described by the L2 entry cluster_offset
 * available, decompressed, in s->cluster_cache.  A one-entry cache
 * (s->cluster_cache_offset) avoids re-inflating the same cluster.
 * Returns 0 on success, -1 on read or decompression failure.
 */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
{
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    /* low bits of the descriptor: host byte offset of the compressed data */
    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        /* cache miss: read the compressed sectors and inflate them */
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
        if (ret < 0) {
            return -1;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -1;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}
1165

    
1166
/* handle reading after the end of the backing file */
1167
/*
 * Handle reading after the end of the backing file: sectors beyond
 * bs->total_sectors are zero-filled in buf.  Returns how many sectors,
 * starting at sector_num, must actually be read from the backing file.
 */
static int backing_read1(BlockDriverState *bs,
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
{
    int n1;

    /* entirely inside the backing file: nothing to clamp */
    if (sector_num + nb_sectors <= bs->total_sectors)
        return nb_sectors;

    n1 = (sector_num < bs->total_sectors) ? bs->total_sectors - sector_num
                                          : 0;

    /* zero the part of the buffer that lies past the end */
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
    return n1;
}
1180

    
1181
/*
 * Synchronous read path: walk the request one mapping run at a time,
 * sourcing each run from the backing file, zeroes, the decompression
 * cache, or the image file as appropriate.  Returns 0 on success, -1
 * on any I/O or decompression failure.
 */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n, n1;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        n = nb_sectors;
        /* get_cluster_offset() clips n to the run sharing one mapping */
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
                if (n1 > 0) {
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
                    if (ret < 0)
                        return -1;
                }
            } else {
                /* unallocated with no backing file: reads as zeroes */
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            /* serve the run from the decompression cache */
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                /* decrypt in place (flag 0 + decrypt key) */
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
1223

    
1224
/*
 * Synchronous write path: for each chunk, allocate (or reuse) clusters,
 * write the guest data (encrypting it first if needed), then link the
 * new clusters into the L2 table.  Returns 0 on success, -1 on failure.
 */
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
                     const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n;
    uint64_t cluster_offset;
    int n_end;
    QCowL2Meta l2meta;

    while (nb_sectors > 0) {
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n_end = index_in_cluster + nb_sectors;
        /* the encryption bounce buffer covers at most
           QCOW_MAX_CRYPT_CLUSTERS clusters per iteration */
        if (s->crypt_method &&
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
                                              index_in_cluster,
                                              n_end, &n, &l2meta);
        if (!cluster_offset)
            return -1;
        if (s->crypt_method) {
            /* encrypt into the bounce buffer, then write that */
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
                            &s->aes_encrypt_key);
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
                              s->cluster_data, n * 512);
        } else {
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
        }
        if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
            /* roll the allocation back on failure */
            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
            return -1;
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    s->cluster_cache_offset = -1; /* disable compressed cache */
    return 0;
}
1263

    
1264
/* State of one in-flight AIO request on a qcow2 image. */
typedef struct QCowAIOCB {
    BlockDriverAIOCB common;     /* generic part; first so casts from
                                    BlockDriverAIOCB* work */
    int64_t sector_num;          /* next guest sector to process */
    uint8_t *buf;                /* current position in the caller's buffer */
    int nb_sectors;              /* sectors still to process */
    int n;                       /* sectors handled by the step in flight */
    uint64_t cluster_offset;     /* mapping used by the step in flight */
    uint8_t *cluster_data;       /* lazily allocated encryption bounce buf */
    BlockDriverAIOCB *hd_aiocb;  /* request in flight on the image file */
    QEMUBH *bh;                  /* bottom half used to re-enter the cb */
    QCowL2Meta l2meta;           /* pending L2 update for write requests */
} QCowAIOCB;
1276

    
1277
static void qcow_aio_read_cb(void *opaque, int ret);

/* Bottom-half handler: re-enters the read state machine from the event
   loop when a step completed without issuing real I/O. */
static void qcow_aio_read_bh(void *opaque)
{
    QCowAIOCB *acb = opaque;
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qcow_aio_read_cb(opaque, 0);
}
1285

    
1286
/*
 * Arrange for cb to run from a bottom half for the given request.
 * Fails with -EIO if a bottom half is already pending or cannot be
 * created; returns 0 once it is scheduled.
 */
static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
{
    QEMUBH *bh;

    /* only one bottom half may be pending per request */
    if (acb->bh)
        return -EIO;

    bh = qemu_bh_new(cb, acb);
    if (bh == NULL)
        return -EIO;

    acb->bh = bh;
    qemu_bh_schedule(bh);

    return 0;
}
1299

    
1300
/*
 * Completion callback that drives the whole AIO read: post-processes
 * the sectors delivered by the previous step, then issues the next
 * step.  On error, or once every sector is done, it fires the user
 * callback and releases the request.
 */
static void qcow_aio_read_cb(void *opaque, int ret)
{
    QCowAIOCB *acb = opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n1;

    acb->hd_aiocb = NULL;
    if (ret < 0) {
fail:
        acb->common.cb(acb->common.opaque, ret);
        qemu_aio_release(acb);
        return;
    }

    /* post process the read buffer */
    if (!acb->cluster_offset) {
        /* nothing to do */
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* nothing to do */
    } else {
        if (s->crypt_method) {
            /* decrypt in place what the previous step read */
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
                            acb->n, 0,
                            &s->aes_decrypt_key);
        }
    }

    /* advance past the sectors just handled */
    acb->nb_sectors -= acb->n;
    acb->sector_num += acb->n;
    acb->buf += acb->n * 512;

    if (acb->nb_sectors == 0) {
        /* request completed */
        acb->common.cb(acb->common.opaque, 0);
        qemu_aio_release(acb);
        return;
    }

    /* prepare next AIO request */
    acb->n = acb->nb_sectors;
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);

    if (!acb->cluster_offset) {
        if (bs->backing_hd) {
            /* read from the base image */
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
                               acb->buf, acb->n);
            if (n1 > 0) {
                acb->hd_aiocb = bdrv_aio_read(bs->backing_hd, acb->sector_num,
                                    acb->buf, acb->n, qcow_aio_read_cb, acb);
                if (acb->hd_aiocb == NULL)
                    goto fail;
            } else {
                /* fully past the backing file: backing_read1() already
                   zeroed the buffer; just re-enter via a bottom half */
                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
                if (ret < 0)
                    goto fail;
            }
        } else {
            /* Note: in this case, no need to wait */
            memset(acb->buf, 0, 512 * acb->n);
            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
            if (ret < 0)
                goto fail;
        }
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* add AIO support for compressed blocks ? */
        if (decompress_cluster(s, acb->cluster_offset) < 0)
            goto fail;
        memcpy(acb->buf,
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
        if (ret < 0)
            goto fail;
    } else {
        if ((acb->cluster_offset & 511) != 0) {
            /* data cluster offsets must be sector aligned */
            ret = -EIO;
            goto fail;
        }
        acb->hd_aiocb = bdrv_aio_read(s->hd,
                            (acb->cluster_offset >> 9) + index_in_cluster,
                            acb->buf, acb->n, qcow_aio_read_cb, acb);
        if (acb->hd_aiocb == NULL)
            goto fail;
    }
}
1387

    
1388
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1389
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1390
        BlockDriverCompletionFunc *cb, void *opaque)
1391
{
1392
    QCowAIOCB *acb;
1393

    
1394
    acb = qemu_aio_get(bs, cb, opaque);
1395
    if (!acb)
1396
        return NULL;
1397
    acb->hd_aiocb = NULL;
1398
    acb->sector_num = sector_num;
1399
    acb->buf = buf;
1400
    acb->nb_sectors = nb_sectors;
1401
    acb->n = 0;
1402
    acb->cluster_offset = 0;
1403
    acb->l2meta.nb_clusters = 0;
1404
    return acb;
1405
}
1406

    
1407
static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
1408
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1409
        BlockDriverCompletionFunc *cb, void *opaque)
1410
{
1411
    QCowAIOCB *acb;
1412

    
1413
    acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
1414
    if (!acb)
1415
        return NULL;
1416

    
1417
    qcow_aio_read_cb(acb, 0);
1418
    return &acb->common;
1419
}
1420

    
1421
/*
 * Completion callback that drives the AIO write: links the clusters the
 * previous step wrote into the L2 table, then allocates clusters for
 * and writes the next chunk.  Fires the user callback and releases the
 * request on completion or error.
 */
static void qcow_aio_write_cb(void *opaque, int ret)
{
    QCowAIOCB *acb = opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    const uint8_t *src_buf;
    int n_end;

    acb->hd_aiocb = NULL;

    if (ret < 0) {
    fail:
        acb->common.cb(acb->common.opaque, ret);
        qemu_aio_release(acb);
        return;
    }

    /* the previous step's data is on disk: make its clusters visible in
       the L2 table (no-op when l2meta.nb_clusters == 0) */
    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
        goto fail;
    }

    /* advance past the sectors just written */
    acb->nb_sectors -= acb->n;
    acb->sector_num += acb->n;
    acb->buf += acb->n * 512;

    if (acb->nb_sectors == 0) {
        /* request completed */
        acb->common.cb(acb->common.opaque, 0);
        qemu_aio_release(acb);
        return;
    }

    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
    n_end = index_in_cluster + acb->nb_sectors;
    /* limit each step to the size of the encryption bounce buffer */
    if (s->crypt_method &&
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;

    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
                                          index_in_cluster,
                                          n_end, &acb->n, &acb->l2meta);
    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
        ret = -EIO;
        goto fail;
    }
    if (s->crypt_method) {
        if (!acb->cluster_data) {
            /* lazily allocate the bounce buffer for encrypted writes */
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
                                             s->cluster_size);
        }
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
                        acb->n, 1, &s->aes_encrypt_key);
        src_buf = acb->cluster_data;
    } else {
        src_buf = acb->buf;
    }
    acb->hd_aiocb = bdrv_aio_write(s->hd,
                                   (acb->cluster_offset >> 9) + index_in_cluster,
                                   src_buf, acb->n,
                                   qcow_aio_write_cb, acb);
    if (acb->hd_aiocb == NULL)
        goto fail;
}
1486

    
1487
static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
1488
        int64_t sector_num, const uint8_t *buf, int nb_sectors,
1489
        BlockDriverCompletionFunc *cb, void *opaque)
1490
{
1491
    BDRVQcowState *s = bs->opaque;
1492
    QCowAIOCB *acb;
1493

    
1494
    s->cluster_cache_offset = -1; /* disable compressed cache */
1495

    
1496
    acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
1497
    if (!acb)
1498
        return NULL;
1499

    
1500
    qcow_aio_write_cb(acb, 0);
1501
    return &acb->common;
1502
}
1503

    
1504
/*
 * Cancel an in-flight request: abort any pending I/O on the underlying
 * file, then release the request state.
 */
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;

    if (acb->hd_aiocb != NULL)
        bdrv_aio_cancel(acb->hd_aiocb);

    qemu_aio_release(acb);
}
1511

    
1512
/* Release all per-image resources when the image is closed. */
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    /* in-memory tables and caches */
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    /* refcount state, then the underlying image file itself */
    refcount_close(bs);
    bdrv_delete(s->hd);
}
1522

    
1523
/* XXX: use std qcow open function ? */
1524
/* Scratch state used by qcow_create2() while laying out a new image. */
typedef struct QCowCreateState {
    int cluster_size;               /* cluster size in bytes */
    int cluster_bits;               /* log2 of cluster_size */
    uint16_t *refcount_block;       /* first refcount block (one cluster) */
    uint64_t *refcount_table;       /* refcount table (one cluster) */
    int64_t l1_table_offset;        /* file offset of the initial L1 table */
    int64_t refcount_table_offset;  /* file offset of the refcount table */
    int64_t refcount_block_offset;  /* file offset of the refcount block */
} QCowCreateState;
1533

    
1534
/*
 * Bump the in-memory refcount of every cluster overlapped by the byte
 * range [offset, offset + size) in the image being created.
 */
static void create_refcount_update(QCowCreateState *s,
                                   int64_t offset, int64_t size)
{
    int64_t first, last_cluster, pos;

    first = offset & ~(s->cluster_size - 1);
    last_cluster = (offset + size - 1) & ~(s->cluster_size - 1);

    for (pos = first; pos <= last_cluster; pos += s->cluster_size) {
        /* entries are stored big-endian on disk */
        uint16_t *entry = &s->refcount_block[pos >> s->cluster_bits];
        *entry = cpu_to_be16(be16_to_cpu(*entry) + 1);
    }
}
1551

    
1552
static int qcow_create2(const char *filename, int64_t total_size,
1553
                        const char *backing_file, const char *backing_format,
1554
                        int flags)
1555
{
1556

    
1557
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
1558
    int backing_format_len = 0;
1559
    QCowHeader header;
1560
    uint64_t tmp, offset;
1561
    QCowCreateState s1, *s = &s1;
1562
    QCowExtension ext_bf = {0, 0};
1563

    
1564

    
1565
    memset(s, 0, sizeof(*s));
1566

    
1567
    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
1568
    if (fd < 0)
1569
        return -1;
1570
    memset(&header, 0, sizeof(header));
1571
    header.magic = cpu_to_be32(QCOW_MAGIC);
1572
    header.version = cpu_to_be32(QCOW_VERSION);
1573
    header.size = cpu_to_be64(total_size * 512);
1574
    header_size = sizeof(header);
1575
    backing_filename_len = 0;
1576
    if (backing_file) {
1577
        if (backing_format) {
1578
            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
1579
            backing_format_len = strlen(backing_format);
1580
            ext_bf.len = (backing_format_len + 7) & ~7;
1581
            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
1582
        }
1583
        header.backing_file_offset = cpu_to_be64(header_size);
1584
        backing_filename_len = strlen(backing_file);
1585
        header.backing_file_size = cpu_to_be32(backing_filename_len);
1586
        header_size += backing_filename_len;
1587
    }
1588
    s->cluster_bits = 12;  /* 4 KB clusters */
1589
    s->cluster_size = 1 << s->cluster_bits;
1590
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
1591
    header_size = (header_size + 7) & ~7;
1592
    if (flags & BLOCK_FLAG_ENCRYPT) {
1593
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1594
    } else {
1595
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1596
    }
1597
    l2_bits = s->cluster_bits - 3;
1598
    shift = s->cluster_bits + l2_bits;
1599
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
1600
    offset = align_offset(header_size, s->cluster_size);
1601
    s->l1_table_offset = offset;
1602
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
1603
    header.l1_size = cpu_to_be32(l1_size);
1604
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
1605

    
1606
    s->refcount_table = qemu_mallocz(s->cluster_size);
1607
    s->refcount_block = qemu_mallocz(s->cluster_size);
1608

    
1609
    s->refcount_table_offset = offset;
1610
    header.refcount_table_offset = cpu_to_be64(offset);
1611
    header.refcount_table_clusters = cpu_to_be32(1);
1612
    offset += s->cluster_size;
1613

    
1614
    s->refcount_table[0] = cpu_to_be64(offset);
1615
    s->refcount_block_offset = offset;
1616
    offset += s->cluster_size;
1617

    
1618
    /* update refcounts */
1619
    create_refcount_update(s, 0, header_size);
1620
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
1621
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
1622
    create_refcount_update(s, s->refcount_block_offset, s->cluster_size);
1623

    
1624
    /* write all the data */
1625
    write(fd, &header, sizeof(header));
1626
    if (backing_file) {
1627
        if (backing_format_len) {
1628
            char zero[16];
1629
            int d = ext_bf.len - backing_format_len;
1630

    
1631
            memset(zero, 0, sizeof(zero));
1632
            cpu_to_be32s(&ext_bf.magic);
1633
            cpu_to_be32s(&ext_bf.len);
1634
            write(fd, &ext_bf, sizeof(ext_bf));
1635
            write(fd, backing_format, backing_format_len);
1636
            if (d>0) {
1637
                write(fd, zero, d);
1638
            }
1639
        }
1640
        write(fd, backing_file, backing_filename_len);
1641
    }
1642
    lseek(fd, s->l1_table_offset, SEEK_SET);
1643
    tmp = 0;
1644
    for(i = 0;i < l1_size; i++) {
1645
        write(fd, &tmp, sizeof(tmp));
1646
    }
1647
    lseek(fd, s->refcount_table_offset, SEEK_SET);
1648
    write(fd, s->refcount_table, s->cluster_size);
1649

    
1650
    lseek(fd, s->refcount_block_offset, SEEK_SET);
1651
    write(fd, s->refcount_block, s->cluster_size);
1652

    
1653
    qemu_free(s->refcount_table);
1654
    qemu_free(s->refcount_block);
1655
    close(fd);
1656
    return 0;
1657
}
1658

    
1659
/* Create a new qcow2 image without an explicit backing-format hint. */
static int qcow_create(const char *filename, int64_t total_size,
                       const char *backing_file, int flags)
{
    return qcow_create2(filename, total_size, backing_file, NULL, flags);
}
1664

    
1665
/*
 * Intentionally a no-op: the disabled implementation below is marked
 * "not correct" (it would drop the L1 mappings without maintaining the
 * refcounts).  Always reports success.
 */
static int qcow_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1684

    
1685
/* XXX: put compressed sectors first, then all the cluster aligned
1686
   tables to avoid losing bytes in alignment */
1687
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
/*
 * Write one cluster in compressed form (nb_sectors must equal the
 * cluster size in sectors); nb_sectors == 0 instead pads the file to a
 * sector boundary.  Data that does not compress is written as a normal
 * cluster.  Returns 0 on success, -EINVAL or -1 on failure.
 *
 * Fixes: out_buf used to leak when alloc_compressed_cluster_offset()
 * failed, and a failing fallback qcow_write() was ignored.
 */
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(s->hd);
        cluster_offset = (cluster_offset + 511) & ~511;
        bdrv_truncate(s->hd, cluster_offset);
        return 0;
    }

    /* only whole clusters can be compressed */
    if (nb_sectors != s->cluster_sectors)
        return -EINVAL;

    /* worst-case deflate output size for one cluster */
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
        qemu_free(out_buf);
        return -1;
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        qemu_free(out_buf);
        deflateEnd(&strm);
        return -1;
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
        if (qcow_write(bs, sector_num, buf, s->cluster_sectors) < 0) {
            qemu_free(out_buf);
            return -1;
        }
    } else {
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
                                              out_len);
        if (!cluster_offset) {
            /* fix: out_buf used to leak on this path */
            qemu_free(out_buf);
            return -1;
        }
        cluster_offset &= s->cluster_offset_mask;
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
            qemu_free(out_buf);
            return -1;
        }
    }

    qemu_free(out_buf);
    return 0;
}
1753

    
1754
/* Flush pending writes on the underlying image file. */
static void qcow_flush(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    bdrv_flush(s->hd);
}
1759

    
1760
/* Fill in driver-specific image information; always succeeds. */
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;
    bdi->cluster_size = s->cluster_size;
    /* byte offset of the VM-state area: the reserved L1 index times the
       number of guest bytes mapped by one L1 entry */
    bdi->vm_state_offset = (int64_t)s->l1_vm_state_index <<
        (s->cluster_bits + s->l2_bits);
    return 0;
}
1768

    
1769
/*********************************************************/
1770
/* snapshot support */
1771

    
1772
/* update the refcounts of snapshots and the copied flag */
/*
 * Walk the L1/L2 tables rooted at l1_table_offset and add 'addend'
 * (+1, -1 or 0) to the refcount of every cluster they reference,
 * including the L2 table clusters themselves.  The QCOW_OFLAG_COPIED
 * flag of each L1/L2 entry is refreshed on the way: it is set iff the
 * resulting refcount is exactly 1.  With addend == 0 only the COPIED
 * flags are recomputed.
 *
 * l1_table_offset may be the active table (s->l1_table_offset), in
 * which case the in-memory copy is reused, or a snapshot's table,
 * which is read from disk into a temporary buffer.
 *
 * Returns 0 on success, -EIO on any read/write error.
 */
static int update_snapshot_refcount(BlockDriverState *bs,
                                    int64_t l1_table_offset,
                                    int l1_size,
                                    int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
    int64_t old_offset, old_l2_offset;
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;

    /* L2 tables are rewritten on disk below; invalidate cached copies. */
    l2_cache_reset(bs);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    l1_allocated = 0;
    if (l1_table_offset != s->l1_table_offset) {
        /* snapshot L1 table: load it from disk (stored big-endian) */
        l1_table = qemu_malloc(l1_size2);
        l1_allocated = 1;
        if (bdrv_pread(s->hd, l1_table_offset,
                       l1_table, l1_size2) != l1_size2)
            goto fail;
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        /* active L1 table: already in memory, already in CPU order */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = 0;
    }

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    l1_modified = 0;
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= ~QCOW_OFLAG_COPIED;
            l2_modified = 0;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    old_offset = offset;
                    offset &= ~QCOW_OFLAG_COPIED;
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* the entry encodes a sector count; update the
                           refcount of every 512-byte sector covered */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0)
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
                                            nb_csectors * 512, addend);
                        /* compressed clusters are never modified */
                        refcount = 2;
                    } else {
                        if (addend != 0) {
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
                        } else {
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
                        }
                    }

                    /* COPIED means "single user, writable in place";
                       it must track refcount == 1 */
                    if (refcount == 1) {
                        offset |= QCOW_OFLAG_COPIED;
                    }
                    if (offset != old_offset) {
                        l2_table[j] = cpu_to_be64(offset);
                        l2_modified = 1;
                    }
                }
            }
            if (l2_modified) {
                if (bdrv_pwrite(s->hd,
                                l2_offset, l2_table, l2_size) != l2_size)
                    goto fail;
            }

            /* also adjust the refcount of the L2 table's own cluster */
            if (addend != 0) {
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }
    if (l1_modified) {
        /* write the table back big-endian, then restore CPU order
           (the active table stays in use in memory) */
        for(i = 0; i < l1_size; i++)
            cpu_to_be64s(&l1_table[i]);
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
                        l1_size2) != l1_size2)
            goto fail;
        for(i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
1883

    
1884
/* Release the in-memory snapshot table: every snapshot's name and id
 * string, then the array itself; leaves the state consistent (empty). */
static void qcow_free_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->nb_snapshots; idx++) {
        qemu_free(state->snapshots[idx].name);
        qemu_free(state->snapshots[idx].id_str);
    }
    qemu_free(state->snapshots);
    state->snapshots = NULL;
    state->nb_snapshots = 0;
}
1897

    
1898
/*
 * Load the snapshot table from disk into s->snapshots.
 * Each on-disk entry is an 8-byte-aligned QCowSnapshotHeader (fields
 * big-endian) followed by optional extra data, the id string and the
 * name, both stored without a terminating NUL.
 * On success s->snapshots_size reflects the total on-disk size.
 * Returns 0 on success; on error frees everything read so far and
 * returns -1.
 */
static int qcow_read_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshotHeader h;
    QCowSnapshot *sn;
    int i, id_str_size, name_size;
    int64_t offset;
    uint32_t extra_data_size;

    if (!s->nb_snapshots) {
        s->snapshots = NULL;
        s->snapshots_size = 0;
        return 0;
    }

    offset = s->snapshots_offset;
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
    for(i = 0; i < s->nb_snapshots; i++) {
        /* each snapshot header starts on an 8-byte boundary */
        offset = align_offset(offset, 8);
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        sn = s->snapshots + i;
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
        sn->l1_size = be32_to_cpu(h.l1_size);
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
        sn->date_sec = be32_to_cpu(h.date_sec);
        sn->date_nsec = be32_to_cpu(h.date_nsec);
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
        extra_data_size = be32_to_cpu(h.extra_data_size);

        id_str_size = be16_to_cpu(h.id_str_size);
        name_size = be16_to_cpu(h.name_size);

        /* unknown extra data is skipped (and not preserved in memory) */
        offset += extra_data_size;

        /* id and name are not NUL-terminated on disk: add the NUL here */
        sn->id_str = qemu_malloc(id_str_size + 1);
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        sn->id_str[id_str_size] = '\0';

        sn->name = qemu_malloc(name_size + 1);
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
        sn->name[name_size] = '\0';
    }
    s->snapshots_size = offset - s->snapshots_offset;
    return 0;
 fail:
    qcow_free_snapshots(bs);
    return -1;
}
1952

    
1953
/* add at the end of the file a new list of snapshots */
/*
 * Serialize s->snapshots into freshly allocated clusters, point the
 * image header at the new table and only then free the clusters of
 * the old table, so a crash in between leaves a valid image.
 * Returns 0 on success, -1 on I/O error.  NOTE(review): on failure
 * the clusters already allocated for the new table are not released.
 */
static int qcow_write_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    QCowSnapshotHeader h;
    int i, name_size, id_str_size, snapshots_size;
    uint64_t data64;
    uint32_t data32;
    int64_t offset, snapshots_offset;

    /* compute the size of the snapshots */
    offset = 0;
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        offset = align_offset(offset, 8);    /* headers are 8-byte aligned */
        offset += sizeof(h);
        offset += strlen(sn->id_str);
        offset += strlen(sn->name);
    }
    snapshots_size = offset;

    snapshots_offset = alloc_clusters(bs, snapshots_size);
    offset = snapshots_offset;

    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        /* build the on-disk header with all fields big-endian */
        memset(&h, 0, sizeof(h));
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
        h.l1_size = cpu_to_be32(sn->l1_size);
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
        h.date_sec = cpu_to_be32(sn->date_sec);
        h.date_nsec = cpu_to_be32(sn->date_nsec);
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);

        /* id and name are written without a terminating NUL */
        id_str_size = strlen(sn->id_str);
        name_size = strlen(sn->name);
        h.id_str_size = cpu_to_be16(id_str_size);
        h.name_size = cpu_to_be16(name_size);
        offset = align_offset(offset, 8);
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
    }

    /* update the various header fields */
    data64 = cpu_to_be64(snapshots_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
                    &data64, sizeof(data64)) != sizeof(data64))
        goto fail;
    data32 = cpu_to_be32(s->nb_snapshots);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
                    &data32, sizeof(data32)) != sizeof(data32))
        goto fail;

    /* free the old snapshot table */
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
    s->snapshots_offset = snapshots_offset;
    s->snapshots_size = snapshots_size;
    return 0;
 fail:
    return -1;
}
2022

    
2023
/* Produce a fresh numeric snapshot ID in id_str: one greater than the
 * largest numeric ID currently in use (non-numeric IDs parse as 0). */
static void find_new_snapshot_id(BlockDriverState *bs,
                                 char *id_str, int id_str_size)
{
    BDRVQcowState *state = bs->opaque;
    int idx, id_max = 0;

    for (idx = 0; idx < state->nb_snapshots; idx++) {
        int cur = strtoul(state->snapshots[idx].id_str, NULL, 10);

        if (cur > id_max)
            id_max = cur;
    }
    snprintf(id_str, id_str_size, "%d", id_max + 1);
}
2038

    
2039
/* Return the index of the snapshot whose ID string matches exactly,
 * or -1 if there is none. */
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
{
    BDRVQcowState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->nb_snapshots; idx++) {
        if (strcmp(state->snapshots[idx].id_str, id_str) == 0)
            return idx;
    }
    return -1;
}
2050

    
2051
/* Look a snapshot up first by ID, then by name; an exact ID match
 * takes precedence.  Returns the snapshot index or -1 if not found. */
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
{
    BDRVQcowState *state = bs->opaque;
    int idx;

    idx = find_snapshot_by_id(bs, name);
    if (idx >= 0)
        return idx;

    for (idx = 0; idx < state->nb_snapshots; idx++) {
        if (strcmp(state->snapshots[idx].name, name) == 0)
            return idx;
    }
    return -1;
}
2065

    
2066
/* if no id is provided, a new one is constructed */
2067
static int qcow_snapshot_create(BlockDriverState *bs,
2068
                                QEMUSnapshotInfo *sn_info)
2069
{
2070
    BDRVQcowState *s = bs->opaque;
2071
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
2072
    int i, ret;
2073
    uint64_t *l1_table = NULL;
2074

    
2075
    memset(sn, 0, sizeof(*sn));
2076

    
2077
    if (sn_info->id_str[0] == '\0') {
2078
        /* compute a new id */
2079
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
2080
    }
2081

    
2082
    /* check that the ID is unique */
2083
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
2084
        return -ENOENT;
2085

    
2086
    sn->id_str = qemu_strdup(sn_info->id_str);
2087
    if (!sn->id_str)
2088
        goto fail;
2089
    sn->name = qemu_strdup(sn_info->name);
2090
    if (!sn->name)
2091
        goto fail;
2092
    sn->vm_state_size = sn_info->vm_state_size;
2093
    sn->date_sec = sn_info->date_sec;
2094
    sn->date_nsec = sn_info->date_nsec;
2095
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
2096

    
2097
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
2098
    if (ret < 0)
2099
        goto fail;
2100

    
2101
    /* create the L1 table of the snapshot */
2102
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
2103
    sn->l1_size = s->l1_size;
2104

    
2105
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
2106
    for(i = 0; i < s->l1_size; i++) {
2107
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
2108
    }
2109
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
2110
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
2111
        (s->l1_size * sizeof(uint64_t)))
2112
        goto fail;
2113
    qemu_free(l1_table);
2114
    l1_table = NULL;
2115

    
2116
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
2117
    if (s->snapshots) {
2118
        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
2119
        qemu_free(s->snapshots);
2120
    }
2121
    s->snapshots = snapshots1;
2122
    s->snapshots[s->nb_snapshots++] = *sn;
2123

    
2124
    if (qcow_write_snapshots(bs) < 0)
2125
        goto fail;
2126
#ifdef DEBUG_ALLOC
2127
    check_refcounts(bs);
2128
#endif
2129
    return 0;
2130
 fail:
2131
    qemu_free(sn->name);
2132
    qemu_free(l1_table);
2133
    return -1;
2134
}
2135

    
2136
/* copy the snapshot 'snapshot_name' into the current disk image */
/*
 * Revert the active image to a snapshot (looked up by ID or name):
 * drop the references held by the current L1 table, copy the
 * snapshot's L1 table over the active one (on disk and in memory),
 * then re-reference every cluster now reachable.
 * Returns 0 on success, -ENOENT if the snapshot does not exist,
 * -EIO on any other failure.
 */
static int qcow_snapshot_goto(BlockDriverState *bs,
                              const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int i, snapshot_index, l1_size2;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    /* release the clusters referenced by the current L1 table */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
        goto fail;

    if (grow_l1_table(bs, sn->l1_size) < 0)
        goto fail;

    s->l1_size = sn->l1_size;
    l1_size2 = s->l1_size * sizeof(uint64_t);
    /* copy the snapshot l1 table to the current l1 table */
    if (bdrv_pread(s->hd, sn->l1_table_offset,
                   s->l1_table, l1_size2) != l1_size2)
        goto fail;
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
                    s->l1_table, l1_size2) != l1_size2)
        goto fail;
    /* the table was read in big-endian disk order; convert in place */
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }

    /* the active image references the snapshot's clusters again */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
 fail:
    return -EIO;
}
2178

    
2179
/*
 * Delete a snapshot (looked up by ID or name): drop the references
 * held through its L1 table, refresh the COPIED flags of the active
 * table (addend 0), free the snapshot's L1 clusters and rewrite the
 * on-disk snapshot table without the entry.
 * Returns 0 on success or a negative errno.
 */
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int snapshot_index, ret;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    /* release every cluster reachable only through this snapshot */
    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
    if (ret < 0)
        return ret;
    /* must update the copied flag on the current cluster offsets */
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
    if (ret < 0)
        return ret;
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));

    /* remove the entry from the in-memory table, then persist */
    qemu_free(sn->id_str);
    qemu_free(sn->name);
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
    s->nb_snapshots--;
    ret = qcow_write_snapshots(bs);
    if (ret < 0) {
        /* XXX: restore snapshot if error ? */
        return ret;
    }
#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
}
2213

    
2214
static int qcow_snapshot_list(BlockDriverState *bs,
2215
                              QEMUSnapshotInfo **psn_tab)
2216
{
2217
    BDRVQcowState *s = bs->opaque;
2218
    QEMUSnapshotInfo *sn_tab, *sn_info;
2219
    QCowSnapshot *sn;
2220
    int i;
2221

    
2222
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2223
    for(i = 0; i < s->nb_snapshots; i++) {
2224
        sn_info = sn_tab + i;
2225
        sn = s->snapshots + i;
2226
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2227
                sn->id_str);
2228
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2229
                sn->name);
2230
        sn_info->vm_state_size = sn->vm_state_size;
2231
        sn_info->date_sec = sn->date_sec;
2232
        sn_info->date_nsec = sn->date_nsec;
2233
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2234
    }
2235
    *psn_tab = sn_tab;
2236
    return s->nb_snapshots;
2237
}
2238

    
2239
/*********************************************************/
2240
/* refcount handling */
2241

    
2242
/* Allocate the refcount block cache and load the refcount table from
 * disk (entries converted from big-endian).  Returns 0 on success.
 * Keeps the historical behaviour of reporting a read failure as
 * -ENOMEM. */
static int refcount_init(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;
    int table_bytes, idx;

    state->refcount_block_cache = qemu_malloc(state->cluster_size);
    table_bytes = state->refcount_table_size * sizeof(uint64_t);
    state->refcount_table = qemu_malloc(table_bytes);

    if (state->refcount_table_size > 0) {
        if (bdrv_pread(state->hd, state->refcount_table_offset,
                       state->refcount_table, table_bytes) != table_bytes)
            return -ENOMEM;
        for (idx = 0; idx < state->refcount_table_size; idx++)
            be64_to_cpus(&state->refcount_table[idx]);
    }
    return 0;
}
2262

    
2263
/* Free the buffers allocated by refcount_init(). */
static void refcount_close(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;

    qemu_free(state->refcount_block_cache);
    qemu_free(state->refcount_table);
}
2269

    
2270

    
2271
/* Read one refcount block from disk into the single-entry cache and
 * remember its offset.  Returns 0 on success, -EIO on a short read. */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset)
{
    BDRVQcowState *state = bs->opaque;

    if (bdrv_pread(state->hd, refcount_block_offset,
                   state->refcount_block_cache,
                   state->cluster_size) != state->cluster_size)
        return -EIO;

    state->refcount_block_cache_offset = refcount_block_offset;
    return 0;
}
2283

    
2284
/* Return the refcount of one cluster.  Clusters beyond the refcount
 * table, or inside a table entry of 0, are unallocated and report 0. */
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *state = bs->opaque;
    int64_t block_offset;
    int table_index, entry_index;

    table_index = cluster_index >> (state->cluster_bits - REFCOUNT_SHIFT);
    if (table_index >= state->refcount_table_size)
        return 0;

    block_offset = state->refcount_table[table_index];
    if (!block_offset)
        return 0;

    if (block_offset != state->refcount_block_cache_offset) {
        /* better than nothing: return allocated if read error */
        if (load_refcount_block(bs, block_offset) < 0)
            return 1;
    }

    entry_index = cluster_index &
        ((1 << (state->cluster_bits - REFCOUNT_SHIFT)) - 1);
    return be16_to_cpu(state->refcount_block_cache[entry_index]);
}
2305

    
2306
/* return < 0 if error */
2307
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2308
{
2309
    BDRVQcowState *s = bs->opaque;
2310
    int i, nb_clusters;
2311

    
2312
    nb_clusters = size_to_clusters(s, size);
2313
retry:
2314
    for(i = 0; i < nb_clusters; i++) {
2315
        int64_t i = s->free_cluster_index++;
2316
        if (get_refcount(bs, i) != 0)
2317
            goto retry;
2318
    }
2319
#ifdef DEBUG_ALLOC2
2320
    printf("alloc_clusters: size=%lld -> %lld\n",
2321
            size,
2322
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2323
#endif
2324
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2325
}
2326

    
2327
/* Allocate whole clusters for 'size' bytes and take one reference on
 * each of them.  Returns the byte offset of the first cluster. */
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
{
    int64_t first_cluster_offset = alloc_clusters_noref(bs, size);

    update_refcount(bs, first_cluster_offset, size, 1);
    return first_cluster_offset;
}
2335

    
2336
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
/*
 * Sub-cluster byte allocator.  s->free_byte_offset points into a
 * partially filled cluster (0 means "none").  Each allocation that
 * lands in an already-started cluster bumps that cluster's refcount,
 * so the cluster carries one reference per allocation within it.
 * Returns the byte offset of the allocated region.
 */
static int64_t alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, cluster_offset;
    int free_in_cluster;

    assert(size > 0 && size <= s->cluster_size);
    if (s->free_byte_offset == 0) {
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
    }
 redo:
    /* bytes remaining in the cluster currently used for byte allocation */
    free_in_cluster = s->cluster_size -
        (s->free_byte_offset & (s->cluster_size - 1));
    if (size <= free_in_cluster) {
        /* enough space in current cluster */
        offset = s->free_byte_offset;
        s->free_byte_offset += size;
        free_in_cluster -= size;
        if (free_in_cluster == 0)
            s->free_byte_offset = 0;    /* exhausted: start a fresh cluster next time */
        if ((offset & (s->cluster_size - 1)) != 0)
            /* not the first allocation in this cluster: add a reference
               (the first one was taken by alloc_clusters above) */
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
    } else {
        offset = alloc_clusters(bs, s->cluster_size);
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
        if ((cluster_offset + s->cluster_size) == offset) {
            /* we are lucky: contiguous data */
            offset = s->free_byte_offset;
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
            s->free_byte_offset += size;
        } else {
            /* not contiguous: abandon the tail of the old cluster and
               restart the allocation in the new one */
            s->free_byte_offset = offset;
            goto redo;
        }
    }
    return offset;
}
2375

    
2376
/* Drop one reference from every cluster overlapped by the byte range
 * [offset, offset + size). */
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size)
{
    update_refcount(bs, offset, size, -1);
}
2381

    
2382
/*
 * Grow the refcount table until it holds at least min_size entries.
 * The size is increased by ~50% per round, rounded to whole clusters.
 * The new table is written to freshly allocated clusters, the image
 * header is updated, and only then are the old table's clusters freed.
 * Returns 0 on success (including "already big enough"), -EIO on error.
 */
static int grow_refcount_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
    uint64_t *new_table;
    int64_t table_offset;
    uint8_t data[12];
    int old_table_size;
    int64_t old_table_offset;

    if (min_size <= s->refcount_table_size)
        return 0;
    /* compute new table size */
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
    for(;;) {
        if (refcount_table_clusters == 0) {
            refcount_table_clusters = 1;
        } else {
            /* grow by roughly 1.5x until min_size fits */
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
        }
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
        if (min_size <= new_table_size)
            break;
    }
#ifdef DEBUG_ALLOC2
    printf("grow_refcount_table from %d to %d\n",
           s->refcount_table_size,
           new_table_size);
#endif
    new_table_size2 = new_table_size * sizeof(uint64_t);
    new_table = qemu_mallocz(new_table_size2);
    memcpy(new_table, s->refcount_table,
           s->refcount_table_size * sizeof(uint64_t));
    /* convert to big-endian for the on-disk copy ... */
    for(i = 0; i < s->refcount_table_size; i++)
        cpu_to_be64s(&new_table[i]);
    /* Note: we cannot update the refcount now to avoid recursion */
    table_offset = alloc_clusters_noref(bs, new_table_size2);
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
    if (ret != new_table_size2)
        goto fail;
    /* ... and back to CPU order for the in-memory copy */
    for(i = 0; i < s->refcount_table_size; i++)
        be64_to_cpus(&new_table[i]);

    /* 12 bytes: 64-bit table offset then 32-bit cluster count; this
       relies on refcount_table_clusters immediately following
       refcount_table_offset in QCowHeader */
    cpu_to_be64w((uint64_t*)data, table_offset);
    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
                    data, sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->refcount_table);
    old_table_offset = s->refcount_table_offset;
    old_table_size = s->refcount_table_size;
    s->refcount_table = new_table;
    s->refcount_table_size = new_table_size;
    s->refcount_table_offset = table_offset;

    /* the new table is live now: account for its clusters and release
       the old table's */
    update_refcount(bs, table_offset, new_table_size2, 1);
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
    return 0;
 fail:
    free_clusters(bs, table_offset, new_table_size2);
    qemu_free(new_table);
    return -EIO;
}
2445

    
2446
/* addend must be 1 or -1 */
/* XXX: cache several refcount block clusters ? */
/*
 * Adjust the refcount of a single cluster by 'addend' and write the
 * updated 16-bit entry back to disk.  Grows the refcount table and
 * allocates a new refcount block on demand; those allocations go
 * through alloc_clusters_noref() to avoid recursing into this
 * function.  Returns the new refcount (>= 0) or a negative errno.
 */
static int update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
                                   int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, refcount_block_offset;
    int ret, refcount_table_index, block_index, refcount;
    uint64_t data64;

    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (refcount_table_index >= s->refcount_table_size) {
        /* a cluster past the table has refcount 0: decrementing it
           would be a bug */
        if (addend < 0)
            return -EINVAL;
        ret = grow_refcount_table(bs, refcount_table_index + 1);
        if (ret < 0)
            return ret;
    }
    refcount_block_offset = s->refcount_table[refcount_table_index];
    if (!refcount_block_offset) {
        if (addend < 0)
            return -EINVAL;
        /* create a new refcount block */
        /* Note: we cannot update the refcount now to avoid recursion */
        offset = alloc_clusters_noref(bs, s->cluster_size);
        memset(s->refcount_block_cache, 0, s->cluster_size);
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
        if (ret != s->cluster_size)
            return -EINVAL;
        /* publish the new block in the in-memory and on-disk tables */
        s->refcount_table[refcount_table_index] = offset;
        data64 = cpu_to_be64(offset);
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
                          refcount_table_index * sizeof(uint64_t),
                          &data64, sizeof(data64));
        if (ret != sizeof(data64))
            return -EINVAL;

        refcount_block_offset = offset;
        s->refcount_block_cache_offset = offset;
        /* the block exists now: account for its own cluster */
        update_refcount(bs, offset, s->cluster_size, 1);
    } else {
        if (refcount_block_offset != s->refcount_block_cache_offset) {
            if (load_refcount_block(bs, refcount_block_offset) < 0)
                return -EIO;
        }
    }
    /* we can update the count and save it */
    block_index = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
    refcount += addend;
    if (refcount < 0 || refcount > 0xffff)
        return -EINVAL;
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
        /* remember the lowest known free cluster to speed up the
           allocation scan */
        s->free_cluster_index = cluster_index;
    }
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
    /* write back just the 2 bytes of this entry */
    if (bdrv_pwrite(s->hd,
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
                    &s->refcount_block_cache[block_index], 2) != 2)
        return -EIO;
    return refcount;
}
2510

    
2511
/* Adjust by 'addend' the refcount of every cluster overlapped by the
 * byte range [offset, offset + length).  Note that errors from
 * update_cluster_refcount() are silently ignored here. */
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend)
{
    BDRVQcowState *state = bs->opaque;
    int64_t first, last, cluster;

#ifdef DEBUG_ALLOC2
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
           offset, length, addend);
#endif
    if (length <= 0)
        return;

    /* round the range out to whole cluster boundaries */
    first = offset & ~(state->cluster_size - 1);
    last = (offset + length - 1) & ~(state->cluster_size - 1);
    for (cluster = first; cluster <= last; cluster += state->cluster_size)
        update_cluster_refcount(bs, cluster >> state->cluster_bits, addend);
}
2531

    
2532
#ifdef DEBUG_ALLOC
2533
/* Consistency-check helper (DEBUG_ALLOC builds): increment the
 * shadow refcount of every cluster overlapped by [offset, offset+size)
 * in the caller-supplied table, printing an error for out-of-range
 * offsets and 16-bit counter overflows. */
static void inc_refcounts(BlockDriverState *bs,
                          uint16_t *refcount_table,
                          int refcount_table_size,
                          int64_t offset, int64_t size)
{
    BDRVQcowState *state = bs->opaque;
    int64_t first, last, cluster;
    int idx;

    if (size <= 0)
        return;

    first = offset & ~(state->cluster_size - 1);
    last = (offset + size - 1) & ~(state->cluster_size - 1);
    for (cluster = first; cluster <= last; cluster += state->cluster_size) {
        idx = cluster >> state->cluster_bits;
        if (idx < 0 || idx >= refcount_table_size) {
            printf("ERROR: invalid cluster offset=0x%llx\n", cluster);
        } else if (++refcount_table[idx] == 0) {
            printf("ERROR: overflow cluster offset=0x%llx\n", cluster);
        }
    }
}
2559

    
2560
/* Debug helper: walk one L1 table (and all L2 tables it points to) and
 * account every cluster it references into 'refcount_table'.
 *
 * @l1_table_offset / @l1_size: location and entry count of the L1 table.
 * @check_copied: when non-zero, also verify that QCOW_OFLAG_COPIED is set
 *   exactly on entries whose cluster refcount is 1 (the active L1; the
 *   flag is meaningless in snapshot tables, so callers pass 0 there).
 *
 * Returns 0 on success, -EIO if reading an L1/L2 table fails.
 * Inconsistencies are only printed; they do not fail the check. */
static int check_refcounts_l1(BlockDriverState *bs,
                              uint16_t *refcount_table,
                              int refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int check_copied)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
    int l2_size, i, j, nb_csectors, refcount;

    l2_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* The L1 table itself occupies clusters. */
    inc_refcounts(bs, refcount_table, refcount_table_size,
                  l1_table_offset, l1_size2);

    /* NOTE(review): qemu_malloc results are used unchecked here;
     * presumably it aborts on OOM — confirm against its definition. */
    l1_table = qemu_malloc(l1_size2);
    if (bdrv_pread(s->hd, l1_table_offset,
                   l1_table, l1_size2) != l1_size2)
        goto fail;
    for(i = 0;i < l1_size; i++)
        be64_to_cpus(&l1_table[i]);

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            if (check_copied) {
                /* OFLAG_COPIED must be set iff the L2 table's own cluster
                 * has refcount 1.
                 * Casts below: uint64_t need not match %llx/%lld exactly;
                 * a mismatched printf specifier is undefined behavior. */
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
                    printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
                           (unsigned long long)l2_offset, refcount);
                }
            }
            l2_offset &= ~QCOW_OFLAG_COPIED;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* Compressed clusters are never shared via COW,
                         * so COPIED must not be set on them. */
                        if (offset & QCOW_OFLAG_COPIED) {
                            printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
                                   (long long)(offset >> s->cluster_bits));
                            offset &= ~QCOW_OFLAG_COPIED;
                        }
                        /* Compressed size is encoded in 512-byte sectors. */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        offset &= s->cluster_offset_mask;
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset & ~511, nb_csectors * 512);
                    } else {
                        if (check_copied) {
                            refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                            if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
                                printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
                                       (unsigned long long)offset, refcount);
                            }
                        }
                        offset &= ~QCOW_OFLAG_COPIED;
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset, s->cluster_size);
                    }
                }
            }
            /* The L2 table's own cluster. */
            inc_refcounts(bs, refcount_table,
                          refcount_table_size,
                          l2_offset,
                          s->cluster_size);
        }
    }
    qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    printf("ERROR: I/O error in check_refcounts_l1\n");
    qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
2643

    
2644
/* Debug helper: cross-check the image's stored refcounts against a
 * recomputed reference table.  Every structure that consumes clusters
 * (header, active L1/L2 chains, snapshot L1 chains, snapshot metadata,
 * refcount table and refcount blocks) is accounted, then each cluster's
 * stored refcount is compared with the recomputed value and mismatches
 * are printed. */
static void check_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint16_t *computed;
    QCowSnapshot *snap;
    int64_t image_size;
    int nb_clusters, k, stored, expected;

    image_size = bdrv_getlength(s->hd);
    nb_clusters = size_to_clusters(s, image_size);
    computed = qemu_mallocz(nb_clusters * sizeof(uint16_t));

    /* header */
    inc_refcounts(bs, computed, nb_clusters, 0, s->cluster_size);

    /* active L1/L2 chain; also validates QCOW_OFLAG_COPIED (last arg 1) */
    check_refcounts_l1(bs, computed, nb_clusters,
                       s->l1_table_offset, s->l1_size, 1);

    /* snapshots */
    for (k = 0; k < s->nb_snapshots; k++) {
        snap = s->snapshots + k;
        check_refcounts_l1(bs, computed, nb_clusters,
                           snap->l1_table_offset, snap->l1_size, 0);
    }
    inc_refcounts(bs, computed, nb_clusters,
                  s->snapshots_offset, s->snapshots_size);

    /* refcount data */
    inc_refcounts(bs, computed, nb_clusters,
                  s->refcount_table_offset,
                  s->refcount_table_size * sizeof(uint64_t));
    for (k = 0; k < s->refcount_table_size; k++) {
        int64_t block_offset = s->refcount_table[k];
        if (block_offset != 0) {
            inc_refcounts(bs, computed, nb_clusters,
                          block_offset, s->cluster_size);
        }
    }

    /* compare ref counts */
    for (k = 0; k < nb_clusters; k++) {
        stored = get_refcount(bs, k);
        expected = computed[k];
        if (stored != expected)
            printf("ERROR cluster %d refcount=%d reference=%d\n",
                   k, stored, expected);
    }

    qemu_free(computed);
}
2696

    
2697
#if 0
2698
/* Debug helper (currently compiled out by the surrounding #if 0):
 * print the image's refcounts as runs of clusters sharing one value. */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(s->hd);
    nb_clusters = size_to_clusters(s, size);
    for(k = 0; k < nb_clusters;) {
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        /* Extend the run while the refcount stays the same. */
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        /* Casts: int64_t need not be 'long long'; a mismatched printf
         * specifier is undefined behavior.
         * NOTE(review): this labels the run with 'k' (one past its end),
         * not 'k1' (its start) — looks like k1 was intended; confirm. */
        printf("%lld: refcount=%d nb=%lld\n", (long long)k, refcount,
               (long long)(k - k1));
    }
}
2715
#endif
2716
#endif
2717

    
2718
BlockDriver bdrv_qcow2 = {
2719
    .format_name        = "qcow2",
2720
    .instance_size        = sizeof(BDRVQcowState),
2721
    .bdrv_probe                = qcow_probe,
2722
    .bdrv_open                = qcow_open,
2723
    .bdrv_close                = qcow_close,
2724
    .bdrv_create        = qcow_create,
2725
    .bdrv_flush                = qcow_flush,
2726
    .bdrv_is_allocated        = qcow_is_allocated,
2727
    .bdrv_set_key        = qcow_set_key,
2728
    .bdrv_make_empty        = qcow_make_empty,
2729

    
2730
    .bdrv_aio_read        = qcow_aio_read,
2731
    .bdrv_aio_write        = qcow_aio_write,
2732
    .bdrv_aio_cancel        = qcow_aio_cancel,
2733
    .aiocb_size                = sizeof(QCowAIOCB),
2734
    .bdrv_write_compressed = qcow_write_compressed,
2735

    
2736
    .bdrv_snapshot_create = qcow_snapshot_create,
2737
    .bdrv_snapshot_goto        = qcow_snapshot_goto,
2738
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2739
    .bdrv_snapshot_list        = qcow_snapshot_list,
2740
    .bdrv_get_info        = qcow_get_info,
2741

    
2742
    .bdrv_create2 = qcow_create2,
2743
};