Statistics
| Branch: | Revision:

root / block-qcow2.c @ 2d2431f0

History | View | Annotate | Download (84.1 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block_int.h"
26
#include <zlib.h>
27
#include "aes.h"
28
#include <assert.h>
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots.
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables have always a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
47
//#define DEBUG_ALLOC2
48
//#define DEBUG_EXT
49

    
50
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
51
#define QCOW_VERSION 2
52

    
53
#define QCOW_CRYPT_NONE 0
54
#define QCOW_CRYPT_AES  1
55

    
56
#define QCOW_MAX_CRYPT_CLUSTERS 32
57

    
58
/* indicate that the refcount of the referenced cluster is exactly one. */
59
#define QCOW_OFLAG_COPIED     (1LL << 63)
60
/* indicate that the cluster is compressed (they never have the copied flag) */
61
#define QCOW_OFLAG_COMPRESSED (1LL << 62)
62

    
63
#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
64

    
65
typedef struct QCowHeader {
66
    uint32_t magic;
67
    uint32_t version;
68
    uint64_t backing_file_offset;
69
    uint32_t backing_file_size;
70
    uint32_t cluster_bits;
71
    uint64_t size; /* in bytes */
72
    uint32_t crypt_method;
73
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
74
    uint64_t l1_table_offset;
75
    uint64_t refcount_table_offset;
76
    uint32_t refcount_table_clusters;
77
    uint32_t nb_snapshots;
78
    uint64_t snapshots_offset;
79
} QCowHeader;
80

    
81

    
82
typedef struct {
83
    uint32_t magic;
84
    uint32_t len;
85
} QCowExtension;
86
#define  QCOW_EXT_MAGIC_END 0
87
#define  QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
88

    
89

    
90
typedef struct __attribute__((packed)) QCowSnapshotHeader {
91
    /* header is 8 byte aligned */
92
    uint64_t l1_table_offset;
93

    
94
    uint32_t l1_size;
95
    uint16_t id_str_size;
96
    uint16_t name_size;
97

    
98
    uint32_t date_sec;
99
    uint32_t date_nsec;
100

    
101
    uint64_t vm_clock_nsec;
102

    
103
    uint32_t vm_state_size;
104
    uint32_t extra_data_size; /* for extension */
105
    /* extra data follows */
106
    /* id_str follows */
107
    /* name follows  */
108
} QCowSnapshotHeader;
109

    
110
#define L2_CACHE_SIZE 16
111

    
112
typedef struct QCowSnapshot {
113
    uint64_t l1_table_offset;
114
    uint32_t l1_size;
115
    char *id_str;
116
    char *name;
117
    uint32_t vm_state_size;
118
    uint32_t date_sec;
119
    uint32_t date_nsec;
120
    uint64_t vm_clock_nsec;
121
} QCowSnapshot;
122

    
123
typedef struct BDRVQcowState {
124
    BlockDriverState *hd;
125
    int cluster_bits;
126
    int cluster_size;
127
    int cluster_sectors;
128
    int l2_bits;
129
    int l2_size;
130
    int l1_size;
131
    int l1_vm_state_index;
132
    int csize_shift;
133
    int csize_mask;
134
    uint64_t cluster_offset_mask;
135
    uint64_t l1_table_offset;
136
    uint64_t *l1_table;
137
    uint64_t *l2_cache;
138
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
139
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
140
    uint8_t *cluster_cache;
141
    uint8_t *cluster_data;
142
    uint64_t cluster_cache_offset;
143

    
144
    uint64_t *refcount_table;
145
    uint64_t refcount_table_offset;
146
    uint32_t refcount_table_size;
147
    uint64_t refcount_block_cache_offset;
148
    uint16_t *refcount_block_cache;
149
    int64_t free_cluster_index;
150
    int64_t free_byte_offset;
151

    
152
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
153
    uint32_t crypt_method_header;
154
    AES_KEY aes_encrypt_key;
155
    AES_KEY aes_decrypt_key;
156
    uint64_t snapshots_offset;
157
    int snapshots_size;
158
    int nb_snapshots;
159
    QCowSnapshot *snapshots;
160
} BDRVQcowState;
161

    
162
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
163
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
164
                     uint8_t *buf, int nb_sectors);
165
static int qcow_read_snapshots(BlockDriverState *bs);
166
static void qcow_free_snapshots(BlockDriverState *bs);
167
static int refcount_init(BlockDriverState *bs);
168
static void refcount_close(BlockDriverState *bs);
169
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
170
static int update_cluster_refcount(BlockDriverState *bs,
171
                                   int64_t cluster_index,
172
                                   int addend);
173
static void update_refcount(BlockDriverState *bs,
174
                            int64_t offset, int64_t length,
175
                            int addend);
176
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
177
static int64_t alloc_bytes(BlockDriverState *bs, int size);
178
static void free_clusters(BlockDriverState *bs,
179
                          int64_t offset, int64_t size);
180
#ifdef DEBUG_ALLOC
181
static void check_refcounts(BlockDriverState *bs);
182
#endif
183

    
184
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
185
{
186
    const QCowHeader *cow_header = (const void *)buf;
187

    
188
    if (buf_size >= sizeof(QCowHeader) &&
189
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
190
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
191
        return 100;
192
    else
193
        return 0;
194
}
195

    
196

    
197
/* 
198
 * read qcow2 extension and fill bs
199
 * start reading from start_offset
200
 * finish reading upon magic of value 0 or when end_offset reached
201
 * unknown magic is skipped (future extension this version knows nothing about)
202
 * return 0 upon success, non-0 otherwise
203
 */
204
static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
205
                                uint64_t end_offset)
206
{
207
    BDRVQcowState *s = bs->opaque;
208
    QCowExtension ext;
209
    uint64_t offset;
210

    
211
#ifdef DEBUG_EXT
212
    printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
213
#endif
214
    offset = start_offset;
215
    while (offset < end_offset) {
216

    
217
#ifdef DEBUG_EXT
218
        /* Sanity check */
219
        if (offset > s->cluster_size)
220
            printf("qcow_handle_extension: suspicious offset %lu\n", offset);
221

    
222
        printf("attemting to read extended header in offset %lu\n", offset);
223
#endif
224

    
225
        if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
226
            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
227
                    (unsigned long long)offset);
228
            return 1;
229
        }
230
        be32_to_cpus(&ext.magic);
231
        be32_to_cpus(&ext.len);
232
        offset += sizeof(ext);
233
#ifdef DEBUG_EXT
234
        printf("ext.magic = 0x%x\n", ext.magic);
235
#endif
236
        switch (ext.magic) {
237
        case QCOW_EXT_MAGIC_END:
238
            return 0;
239

    
240
        case QCOW_EXT_MAGIC_BACKING_FORMAT:
241
            if (ext.len >= sizeof(bs->backing_format)) {
242
                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
243
                        " (>=%zu)\n",
244
                        ext.len, sizeof(bs->backing_format));
245
                return 2;
246
            }
247
            if (bdrv_pread(s->hd, offset , bs->backing_format,
248
                           ext.len) != ext.len)
249
                return 3;
250
            bs->backing_format[ext.len] = '\0';
251
#ifdef DEBUG_EXT
252
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
253
#endif
254
            offset += ((ext.len + 7) & ~7);
255
            break;
256

    
257
        default:
258
            /* unknown magic -- just skip it */
259
            offset += ((ext.len + 7) & ~7);
260
            break;
261
        }
262
    }
263

    
264
    return 0;
265
}
266

    
267

    
268
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
269
{
270
    BDRVQcowState *s = bs->opaque;
271
    int len, i, shift, ret;
272
    QCowHeader header;
273
    uint64_t ext_end;
274

    
275
    /* Performance is terrible right now with cache=writethrough due mainly
276
     * to reference count updates.  If the user does not explicitly specify
277
     * a caching type, force to writeback caching.
278
     */
279
    if ((flags & BDRV_O_CACHE_DEF)) {
280
        flags |= BDRV_O_CACHE_WB;
281
        flags &= ~BDRV_O_CACHE_DEF;
282
    }
283
    ret = bdrv_file_open(&s->hd, filename, flags);
284
    if (ret < 0)
285
        return ret;
286
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
287
        goto fail;
288
    be32_to_cpus(&header.magic);
289
    be32_to_cpus(&header.version);
290
    be64_to_cpus(&header.backing_file_offset);
291
    be32_to_cpus(&header.backing_file_size);
292
    be64_to_cpus(&header.size);
293
    be32_to_cpus(&header.cluster_bits);
294
    be32_to_cpus(&header.crypt_method);
295
    be64_to_cpus(&header.l1_table_offset);
296
    be32_to_cpus(&header.l1_size);
297
    be64_to_cpus(&header.refcount_table_offset);
298
    be32_to_cpus(&header.refcount_table_clusters);
299
    be64_to_cpus(&header.snapshots_offset);
300
    be32_to_cpus(&header.nb_snapshots);
301

    
302
    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
303
        goto fail;
304
    if (header.size <= 1 ||
305
        header.cluster_bits < 9 ||
306
        header.cluster_bits > 16)
307
        goto fail;
308
    if (header.crypt_method > QCOW_CRYPT_AES)
309
        goto fail;
310
    s->crypt_method_header = header.crypt_method;
311
    if (s->crypt_method_header)
312
        bs->encrypted = 1;
313
    s->cluster_bits = header.cluster_bits;
314
    s->cluster_size = 1 << s->cluster_bits;
315
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
316
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
317
    s->l2_size = 1 << s->l2_bits;
318
    bs->total_sectors = header.size / 512;
319
    s->csize_shift = (62 - (s->cluster_bits - 8));
320
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
321
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
322
    s->refcount_table_offset = header.refcount_table_offset;
323
    s->refcount_table_size =
324
        header.refcount_table_clusters << (s->cluster_bits - 3);
325

    
326
    s->snapshots_offset = header.snapshots_offset;
327
    s->nb_snapshots = header.nb_snapshots;
328

    
329
    /* read the level 1 table */
330
    s->l1_size = header.l1_size;
331
    shift = s->cluster_bits + s->l2_bits;
332
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
333
    /* the L1 table must contain at least enough entries to put
334
       header.size bytes */
335
    if (s->l1_size < s->l1_vm_state_index)
336
        goto fail;
337
    s->l1_table_offset = header.l1_table_offset;
338
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
339
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
340
        s->l1_size * sizeof(uint64_t))
341
        goto fail;
342
    for(i = 0;i < s->l1_size; i++) {
343
        be64_to_cpus(&s->l1_table[i]);
344
    }
345
    /* alloc L2 cache */
346
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
347
    s->cluster_cache = qemu_malloc(s->cluster_size);
348
    /* one more sector for decompressed data alignment */
349
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
350
                                  + 512);
351
    s->cluster_cache_offset = -1;
352

    
353
    if (refcount_init(bs) < 0)
354
        goto fail;
355

    
356
    /* read qcow2 extensions */
357
    if (header.backing_file_offset)
358
        ext_end = header.backing_file_offset;
359
    else
360
        ext_end = s->cluster_size;
361
    if (qcow_read_extensions(bs, sizeof(header), ext_end))
362
        goto fail;
363

    
364
    /* read the backing file name */
365
    if (header.backing_file_offset != 0) {
366
        len = header.backing_file_size;
367
        if (len > 1023)
368
            len = 1023;
369
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
370
            goto fail;
371
        bs->backing_file[len] = '\0';
372
    }
373
    if (qcow_read_snapshots(bs) < 0)
374
        goto fail;
375

    
376
#ifdef DEBUG_ALLOC
377
    check_refcounts(bs);
378
#endif
379
    return 0;
380

    
381
 fail:
382
    qcow_free_snapshots(bs);
383
    refcount_close(bs);
384
    qemu_free(s->l1_table);
385
    qemu_free(s->l2_cache);
386
    qemu_free(s->cluster_cache);
387
    qemu_free(s->cluster_data);
388
    bdrv_delete(s->hd);
389
    return -1;
390
}
391

    
392
static int qcow_set_key(BlockDriverState *bs, const char *key)
393
{
394
    BDRVQcowState *s = bs->opaque;
395
    uint8_t keybuf[16];
396
    int len, i;
397

    
398
    memset(keybuf, 0, 16);
399
    len = strlen(key);
400
    if (len > 16)
401
        len = 16;
402
    /* XXX: we could compress the chars to 7 bits to increase
403
       entropy */
404
    for(i = 0;i < len;i++) {
405
        keybuf[i] = key[i];
406
    }
407
    s->crypt_method = s->crypt_method_header;
408

    
409
    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
410
        return -1;
411
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
412
        return -1;
413
#if 0
414
    /* test */
415
    {
416
        uint8_t in[16];
417
        uint8_t out[16];
418
        uint8_t tmp[16];
419
        for(i=0;i<16;i++)
420
            in[i] = i;
421
        AES_encrypt(in, tmp, &s->aes_encrypt_key);
422
        AES_decrypt(tmp, out, &s->aes_decrypt_key);
423
        for(i = 0; i < 16; i++)
424
            printf(" %02x", tmp[i]);
425
        printf("\n");
426
        for(i = 0; i < 16; i++)
427
            printf(" %02x", out[i]);
428
        printf("\n");
429
    }
430
#endif
431
    return 0;
432
}
433

    
434
/* The crypt function is compatible with the linux cryptoloop
435
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
436
   supported */
437
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
438
                            uint8_t *out_buf, const uint8_t *in_buf,
439
                            int nb_sectors, int enc,
440
                            const AES_KEY *key)
441
{
442
    union {
443
        uint64_t ll[2];
444
        uint8_t b[16];
445
    } ivec;
446
    int i;
447

    
448
    for(i = 0; i < nb_sectors; i++) {
449
        ivec.ll[0] = cpu_to_le64(sector_num);
450
        ivec.ll[1] = 0;
451
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
452
                        ivec.b, enc);
453
        sector_num++;
454
        in_buf += 512;
455
        out_buf += 512;
456
    }
457
}
458

    
459
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
460
                        uint64_t cluster_offset, int n_start, int n_end)
461
{
462
    BDRVQcowState *s = bs->opaque;
463
    int n, ret;
464

    
465
    n = n_end - n_start;
466
    if (n <= 0)
467
        return 0;
468
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
469
    if (ret < 0)
470
        return ret;
471
    if (s->crypt_method) {
472
        encrypt_sectors(s, start_sect + n_start,
473
                        s->cluster_data,
474
                        s->cluster_data, n, 1,
475
                        &s->aes_encrypt_key);
476
    }
477
    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
478
                     s->cluster_data, n);
479
    if (ret < 0)
480
        return ret;
481
    return 0;
482
}
483

    
484
static void l2_cache_reset(BlockDriverState *bs)
485
{
486
    BDRVQcowState *s = bs->opaque;
487

    
488
    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
489
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
490
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
491
}
492

    
493
static inline int l2_cache_new_entry(BlockDriverState *bs)
494
{
495
    BDRVQcowState *s = bs->opaque;
496
    uint32_t min_count;
497
    int min_index, i;
498

    
499
    /* find a new entry in the least used one */
500
    min_index = 0;
501
    min_count = 0xffffffff;
502
    for(i = 0; i < L2_CACHE_SIZE; i++) {
503
        if (s->l2_cache_counts[i] < min_count) {
504
            min_count = s->l2_cache_counts[i];
505
            min_index = i;
506
        }
507
    }
508
    return min_index;
509
}
510

    
511
static int64_t align_offset(int64_t offset, int n)
512
{
513
    offset = (offset + n - 1) & ~(n - 1);
514
    return offset;
515
}
516

    
517
static int grow_l1_table(BlockDriverState *bs, int min_size)
518
{
519
    BDRVQcowState *s = bs->opaque;
520
    int new_l1_size, new_l1_size2, ret, i;
521
    uint64_t *new_l1_table;
522
    uint64_t new_l1_table_offset;
523
    uint8_t data[12];
524

    
525
    new_l1_size = s->l1_size;
526
    if (min_size <= new_l1_size)
527
        return 0;
528
    while (min_size > new_l1_size) {
529
        new_l1_size = (new_l1_size * 3 + 1) / 2;
530
    }
531
#ifdef DEBUG_ALLOC2
532
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
533
#endif
534

    
535
    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
536
    new_l1_table = qemu_mallocz(new_l1_size2);
537
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
538

    
539
    /* write new table (align to cluster) */
540
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
541

    
542
    for(i = 0; i < s->l1_size; i++)
543
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
544
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
545
    if (ret != new_l1_size2)
546
        goto fail;
547
    for(i = 0; i < s->l1_size; i++)
548
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
549

    
550
    /* set new table */
551
    cpu_to_be32w((uint32_t*)data, new_l1_size);
552
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
553
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
554
                sizeof(data)) != sizeof(data))
555
        goto fail;
556
    qemu_free(s->l1_table);
557
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
558
    s->l1_table_offset = new_l1_table_offset;
559
    s->l1_table = new_l1_table;
560
    s->l1_size = new_l1_size;
561
    return 0;
562
 fail:
563
    qemu_free(s->l1_table);
564
    return -EIO;
565
}
566

    
567
/*
568
 * seek_l2_table
569
 *
570
 * seek l2_offset in the l2_cache table
571
 * if not found, return NULL,
572
 * if found,
573
 *   increments the l2 cache hit count of the entry,
574
 *   if counter overflow, divide by two all counters
575
 *   return the pointer to the l2 cache entry
576
 *
577
 */
578

    
579
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
580
{
581
    int i, j;
582

    
583
    for(i = 0; i < L2_CACHE_SIZE; i++) {
584
        if (l2_offset == s->l2_cache_offsets[i]) {
585
            /* increment the hit count */
586
            if (++s->l2_cache_counts[i] == 0xffffffff) {
587
                for(j = 0; j < L2_CACHE_SIZE; j++) {
588
                    s->l2_cache_counts[j] >>= 1;
589
                }
590
            }
591
            return s->l2_cache + (i << s->l2_bits);
592
        }
593
    }
594
    return NULL;
595
}
596

    
597
/*
598
 * l2_load
599
 *
600
 * Loads a L2 table into memory. If the table is in the cache, the cache
601
 * is used; otherwise the L2 table is loaded from the image file.
602
 *
603
 * Returns a pointer to the L2 table on success, or NULL if the read from
604
 * the image file failed.
605
 */
606

    
607
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
608
{
609
    BDRVQcowState *s = bs->opaque;
610
    int min_index;
611
    uint64_t *l2_table;
612

    
613
    /* seek if the table for the given offset is in the cache */
614

    
615
    l2_table = seek_l2_table(s, l2_offset);
616
    if (l2_table != NULL)
617
        return l2_table;
618

    
619
    /* not found: load a new entry in the least used one */
620

    
621
    min_index = l2_cache_new_entry(bs);
622
    l2_table = s->l2_cache + (min_index << s->l2_bits);
623
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
624
        s->l2_size * sizeof(uint64_t))
625
        return NULL;
626
    s->l2_cache_offsets[min_index] = l2_offset;
627
    s->l2_cache_counts[min_index] = 1;
628

    
629
    return l2_table;
630
}
631

    
632
/*
633
 * l2_allocate
634
 *
635
 * Allocate a new l2 entry in the file. If l1_index points to an already
636
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
637
 * table) copy the contents of the old L2 table into the newly allocated one.
638
 * Otherwise the new table is initialized with zeros.
639
 *
640
 */
641

    
642
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
643
{
644
    BDRVQcowState *s = bs->opaque;
645
    int min_index;
646
    uint64_t old_l2_offset, tmp;
647
    uint64_t *l2_table, l2_offset;
648

    
649
    old_l2_offset = s->l1_table[l1_index];
650

    
651
    /* allocate a new l2 entry */
652

    
653
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
654

    
655
    /* update the L1 entry */
656

    
657
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
658

    
659
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
660
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
661
                    &tmp, sizeof(tmp)) != sizeof(tmp))
662
        return NULL;
663

    
664
    /* allocate a new entry in the l2 cache */
665

    
666
    min_index = l2_cache_new_entry(bs);
667
    l2_table = s->l2_cache + (min_index << s->l2_bits);
668

    
669
    if (old_l2_offset == 0) {
670
        /* if there was no old l2 table, clear the new table */
671
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
672
    } else {
673
        /* if there was an old l2 table, read it from the disk */
674
        if (bdrv_pread(s->hd, old_l2_offset,
675
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
676
            s->l2_size * sizeof(uint64_t))
677
            return NULL;
678
    }
679
    /* write the l2 table to the file */
680
    if (bdrv_pwrite(s->hd, l2_offset,
681
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
682
        s->l2_size * sizeof(uint64_t))
683
        return NULL;
684

    
685
    /* update the l2 cache entry */
686

    
687
    s->l2_cache_offsets[min_index] = l2_offset;
688
    s->l2_cache_counts[min_index] = 1;
689

    
690
    return l2_table;
691
}
692

    
693
static int size_to_clusters(BDRVQcowState *s, int64_t size)
694
{
695
    return (size + (s->cluster_size - 1)) >> s->cluster_bits;
696
}
697

    
698
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
699
        uint64_t *l2_table, uint64_t start, uint64_t mask)
700
{
701
    int i;
702
    uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;
703

    
704
    if (!offset)
705
        return 0;
706

    
707
    for (i = start; i < start + nb_clusters; i++)
708
        if (offset + i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask))
709
            break;
710

    
711
        return (i - start);
712
}
713

    
714
static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
715
{
716
    int i = 0;
717

    
718
    while(nb_clusters-- && l2_table[i] == 0)
719
        i++;
720

    
721
    return i;
722
}
723

    
724
/*
725
 * get_cluster_offset
726
 *
727
 * For a given offset of the disk image, return cluster offset in
728
 * qcow2 file.
729
 *
730
 * on entry, *num is the number of contiguous clusters we'd like to
731
 * access following offset.
732
 *
733
 * on exit, *num is the number of contiguous clusters we can read.
734
 *
735
 * Return 1, if the offset is found
736
 * Return 0, otherwise.
737
 *
738
 */
739

    
740
static uint64_t get_cluster_offset(BlockDriverState *bs,
741
                                   uint64_t offset, int *num)
742
{
743
    BDRVQcowState *s = bs->opaque;
744
    int l1_index, l2_index;
745
    uint64_t l2_offset, *l2_table, cluster_offset;
746
    int l1_bits, c;
747
    int index_in_cluster, nb_available, nb_needed, nb_clusters;
748

    
749
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
750
    nb_needed = *num + index_in_cluster;
751

    
752
    l1_bits = s->l2_bits + s->cluster_bits;
753

    
754
    /* compute how many bytes there are between the offset and
755
     * the end of the l1 entry
756
     */
757

    
758
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
759

    
760
    /* compute the number of available sectors */
761

    
762
    nb_available = (nb_available >> 9) + index_in_cluster;
763

    
764
    if (nb_needed > nb_available) {
765
        nb_needed = nb_available;
766
    }
767

    
768
    cluster_offset = 0;
769

    
770
    /* seek the the l2 offset in the l1 table */
771

    
772
    l1_index = offset >> l1_bits;
773
    if (l1_index >= s->l1_size)
774
        goto out;
775

    
776
    l2_offset = s->l1_table[l1_index];
777

    
778
    /* seek the l2 table of the given l2 offset */
779

    
780
    if (!l2_offset)
781
        goto out;
782

    
783
    /* load the l2 table in memory */
784

    
785
    l2_offset &= ~QCOW_OFLAG_COPIED;
786
    l2_table = l2_load(bs, l2_offset);
787
    if (l2_table == NULL)
788
        return 0;
789

    
790
    /* find the cluster offset for the given disk offset */
791

    
792
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
793
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
794
    nb_clusters = size_to_clusters(s, nb_needed << 9);
795

    
796
    if (!cluster_offset) {
797
        /* how many empty clusters ? */
798
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
799
    } else {
800
        /* how many allocated clusters ? */
801
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
802
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
803
    }
804

    
805
   nb_available = (c * s->cluster_sectors);
806
out:
807
    if (nb_available > nb_needed)
808
        nb_available = nb_needed;
809

    
810
    *num = nb_available - index_in_cluster;
811

    
812
    return cluster_offset & ~QCOW_OFLAG_COPIED;
813
}
814

    
815
/*
816
 * free_any_clusters
817
 *
818
 * free clusters according to its type: compressed or not
819
 *
820
 */
821

    
822
static void free_any_clusters(BlockDriverState *bs,
823
                              uint64_t cluster_offset, int nb_clusters)
824
{
825
    BDRVQcowState *s = bs->opaque;
826

    
827
    /* free the cluster */
828

    
829
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
830
        int nb_csectors;
831
        nb_csectors = ((cluster_offset >> s->csize_shift) &
832
                       s->csize_mask) + 1;
833
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
834
                      nb_csectors * 512);
835
        return;
836
    }
837

    
838
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
839

    
840
    return;
841
}
842

    
843
/*
844
 * get_cluster_table
845
 *
846
 * for a given disk offset, load (and allocate if needed)
847
 * the l2 table.
848
 *
849
 * the l2 table offset in the qcow2 file and the cluster index
850
 * in the l2 table are given to the caller.
851
 *
852
 */
853

    
854
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
855
                             uint64_t **new_l2_table,
856
                             uint64_t *new_l2_offset,
857
                             int *new_l2_index)
858
{
859
    BDRVQcowState *s = bs->opaque;
860
    int l1_index, l2_index, ret;
861
    uint64_t l2_offset, *l2_table;
862

    
863
    /* seek the the l2 offset in the l1 table */
864

    
865
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
866
    if (l1_index >= s->l1_size) {
867
        ret = grow_l1_table(bs, l1_index + 1);
868
        if (ret < 0)
869
            return 0;
870
    }
871
    l2_offset = s->l1_table[l1_index];
872

    
873
    /* seek the l2 table of the given l2 offset */
874

    
875
    if (l2_offset & QCOW_OFLAG_COPIED) {
876
        /* load the l2 table in memory */
877
        l2_offset &= ~QCOW_OFLAG_COPIED;
878
        l2_table = l2_load(bs, l2_offset);
879
        if (l2_table == NULL)
880
            return 0;
881
    } else {
882
        if (l2_offset)
883
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
884
        l2_table = l2_allocate(bs, l1_index);
885
        if (l2_table == NULL)
886
            return 0;
887
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
888
    }
889

    
890
    /* find the cluster offset for the given disk offset */
891

    
892
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
893

    
894
    *new_l2_table = l2_table;
895
    *new_l2_offset = l2_offset;
896
    *new_l2_index = l2_index;
897

    
898
    return 1;
899
}
900

    
901
/*
902
 * alloc_compressed_cluster_offset
903
 *
904
 * For a given offset of the disk image, return cluster offset in
905
 * qcow2 file.
906
 *
907
 * If the offset is not found, allocate a new compressed cluster.
908
 *
909
 * Return the cluster offset if successful,
910
 * Return 0, otherwise.
911
 *
912
 */
913

    
914
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
915
                                                uint64_t offset,
916
                                                int compressed_size)
917
{
918
    BDRVQcowState *s = bs->opaque;
919
    int l2_index, ret;
920
    uint64_t l2_offset, *l2_table, cluster_offset;
921
    int nb_csectors;
922

    
923
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
924
    if (ret == 0)
925
        return 0;
926

    
927
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
928
    if (cluster_offset & QCOW_OFLAG_COPIED)
929
        return cluster_offset & ~QCOW_OFLAG_COPIED;
930

    
931
    if (cluster_offset)
932
        free_any_clusters(bs, cluster_offset, 1);
933

    
934
    cluster_offset = alloc_bytes(bs, compressed_size);
935
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
936
                  (cluster_offset >> 9);
937

    
938
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
939
                      ((uint64_t)nb_csectors << s->csize_shift);
940

    
941
    /* update L2 table */
942

    
943
    /* compressed clusters never have the copied flag */
944

    
945
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
946
    if (bdrv_pwrite(s->hd,
947
                    l2_offset + l2_index * sizeof(uint64_t),
948
                    l2_table + l2_index,
949
                    sizeof(uint64_t)) != sizeof(uint64_t))
950
        return 0;
951

    
952
    return cluster_offset;
953
}
954

    
955
typedef struct QCowL2Meta
956
{
957
    uint64_t offset;
958
    int n_start;
959
    int nb_available;
960
    int nb_clusters;
961
} QCowL2Meta;
962

    
963
static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
964
        QCowL2Meta *m)
965
{
966
    BDRVQcowState *s = bs->opaque;
967
    int i, j = 0, l2_index, ret;
968
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
969

    
970
    if (m->nb_clusters == 0)
971
        return 0;
972

    
973
    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
974

    
975
    /* copy content of unmodified sectors */
976
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
977
    if (m->n_start) {
978
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
979
        if (ret < 0)
980
            goto err;
981
    }
982

    
983
    if (m->nb_available & (s->cluster_sectors - 1)) {
984
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
985
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
986
                m->nb_available - end, s->cluster_sectors);
987
        if (ret < 0)
988
            goto err;
989
    }
990

    
991
    ret = -EIO;
992
    /* update L2 table */
993
    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
994
        goto err;
995

    
996
    for (i = 0; i < m->nb_clusters; i++) {
997
        if(l2_table[l2_index + i] != 0)
998
            old_cluster[j++] = l2_table[l2_index + i];
999

    
1000
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
1001
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
1002
     }
1003

    
1004
    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
1005
                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
1006
            m->nb_clusters * sizeof(uint64_t))
1007
        goto err;
1008

    
1009
    for (i = 0; i < j; i++)
1010
        free_any_clusters(bs, old_cluster[i], 1);
1011

    
1012
    ret = 0;
1013
err:
1014
    qemu_free(old_cluster);
1015
    return ret;
1016
 }
1017

    
1018
/*
1019
 * alloc_cluster_offset
1020
 *
1021
 * For a given offset of the disk image, return cluster offset in
1022
 * qcow2 file.
1023
 *
1024
 * If the offset is not found, allocate a new cluster.
1025
 *
1026
 * Return the cluster offset if successful,
1027
 * Return 0, otherwise.
1028
 *
1029
 */
1030

    
1031
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
1032
                                     uint64_t offset,
1033
                                     int n_start, int n_end,
1034
                                     int *num, QCowL2Meta *m)
1035
{
1036
    BDRVQcowState *s = bs->opaque;
1037
    int l2_index, ret;
1038
    uint64_t l2_offset, *l2_table, cluster_offset;
1039
    int nb_clusters, i = 0;
1040

    
1041
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
1042
    if (ret == 0)
1043
        return 0;
1044

    
1045
    nb_clusters = size_to_clusters(s, n_end << 9);
1046

    
1047
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1048

    
1049
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
1050

    
1051
    /* We keep all QCOW_OFLAG_COPIED clusters */
1052

    
1053
    if (cluster_offset & QCOW_OFLAG_COPIED) {
1054
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
1055
                &l2_table[l2_index], 0, 0);
1056

    
1057
        cluster_offset &= ~QCOW_OFLAG_COPIED;
1058
        m->nb_clusters = 0;
1059

    
1060
        goto out;
1061
    }
1062

    
1063
    /* for the moment, multiple compressed clusters are not managed */
1064

    
1065
    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
1066
        nb_clusters = 1;
1067

    
1068
    /* how many available clusters ? */
1069

    
1070
    while (i < nb_clusters) {
1071
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
1072
                &l2_table[l2_index], i, 0);
1073

    
1074
        if(be64_to_cpu(l2_table[l2_index + i]))
1075
            break;
1076

    
1077
        i += count_contiguous_free_clusters(nb_clusters - i,
1078
                &l2_table[l2_index + i]);
1079

    
1080
        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
1081

    
1082
        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
1083
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
1084
            break;
1085
    }
1086
    nb_clusters = i;
1087

    
1088
    /* allocate a new cluster */
1089

    
1090
    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
1091

    
1092
    /* save info needed for meta data update */
1093
    m->offset = offset;
1094
    m->n_start = n_start;
1095
    m->nb_clusters = nb_clusters;
1096

    
1097
out:
1098
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
1099

    
1100
    *num = m->nb_available - n_start;
1101

    
1102
    return cluster_offset;
1103
}
1104

    
1105
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
1106
                             int nb_sectors, int *pnum)
1107
{
1108
    uint64_t cluster_offset;
1109

    
1110
    *pnum = nb_sectors;
1111
    cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum);
1112

    
1113
    return (cluster_offset != 0);
1114
}
1115

    
1116
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1117
                             const uint8_t *buf, int buf_size)
1118
{
1119
    z_stream strm1, *strm = &strm1;
1120
    int ret, out_len;
1121

    
1122
    memset(strm, 0, sizeof(*strm));
1123

    
1124
    strm->next_in = (uint8_t *)buf;
1125
    strm->avail_in = buf_size;
1126
    strm->next_out = out_buf;
1127
    strm->avail_out = out_buf_size;
1128

    
1129
    ret = inflateInit2(strm, -12);
1130
    if (ret != Z_OK)
1131
        return -1;
1132
    ret = inflate(strm, Z_FINISH);
1133
    out_len = strm->next_out - out_buf;
1134
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1135
        out_len != out_buf_size) {
1136
        inflateEnd(strm);
1137
        return -1;
1138
    }
1139
    inflateEnd(strm);
1140
    return 0;
1141
}
1142

    
1143
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
1144
{
1145
    int ret, csize, nb_csectors, sector_offset;
1146
    uint64_t coffset;
1147

    
1148
    coffset = cluster_offset & s->cluster_offset_mask;
1149
    if (s->cluster_cache_offset != coffset) {
1150
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1151
        sector_offset = coffset & 511;
1152
        csize = nb_csectors * 512 - sector_offset;
1153
        ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
1154
        if (ret < 0) {
1155
            return -1;
1156
        }
1157
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
1158
                              s->cluster_data + sector_offset, csize) < 0) {
1159
            return -1;
1160
        }
1161
        s->cluster_cache_offset = coffset;
1162
    }
1163
    return 0;
1164
}
1165

    
1166
/* handle reading after the end of the backing file */
1167
static int backing_read1(BlockDriverState *bs,
1168
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
1169
{
1170
    int n1;
1171
    if ((sector_num + nb_sectors) <= bs->total_sectors)
1172
        return nb_sectors;
1173
    if (sector_num >= bs->total_sectors)
1174
        n1 = 0;
1175
    else
1176
        n1 = bs->total_sectors - sector_num;
1177
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
1178
    return n1;
1179
}
1180

    
1181
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
1182
                     uint8_t *buf, int nb_sectors)
1183
{
1184
    BDRVQcowState *s = bs->opaque;
1185
    int ret, index_in_cluster, n, n1;
1186
    uint64_t cluster_offset;
1187

    
1188
    while (nb_sectors > 0) {
1189
        n = nb_sectors;
1190
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
1191
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1192
        if (!cluster_offset) {
1193
            if (bs->backing_hd) {
1194
                /* read from the base image */
1195
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
1196
                if (n1 > 0) {
1197
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
1198
                    if (ret < 0)
1199
                        return -1;
1200
                }
1201
            } else {
1202
                memset(buf, 0, 512 * n);
1203
            }
1204
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
1205
            if (decompress_cluster(s, cluster_offset) < 0)
1206
                return -1;
1207
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
1208
        } else {
1209
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1210
            if (ret != n * 512)
1211
                return -1;
1212
            if (s->crypt_method) {
1213
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
1214
                                &s->aes_decrypt_key);
1215
            }
1216
        }
1217
        nb_sectors -= n;
1218
        sector_num += n;
1219
        buf += n * 512;
1220
    }
1221
    return 0;
1222
}
1223

    
1224
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1225
                     const uint8_t *buf, int nb_sectors)
1226
{
1227
    BDRVQcowState *s = bs->opaque;
1228
    int ret, index_in_cluster, n;
1229
    uint64_t cluster_offset;
1230
    int n_end;
1231
    QCowL2Meta l2meta;
1232

    
1233
    while (nb_sectors > 0) {
1234
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1235
        n_end = index_in_cluster + nb_sectors;
1236
        if (s->crypt_method &&
1237
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1238
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1239
        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
1240
                                              index_in_cluster,
1241
                                              n_end, &n, &l2meta);
1242
        if (!cluster_offset)
1243
            return -1;
1244
        if (s->crypt_method) {
1245
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
1246
                            &s->aes_encrypt_key);
1247
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
1248
                              s->cluster_data, n * 512);
1249
        } else {
1250
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1251
        }
1252
        if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
1253
            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
1254
            return -1;
1255
        }
1256
        nb_sectors -= n;
1257
        sector_num += n;
1258
        buf += n * 512;
1259
    }
1260
    s->cluster_cache_offset = -1; /* disable compressed cache */
1261
    return 0;
1262
}
1263

    
1264
typedef struct QCowAIOCB {
1265
    BlockDriverAIOCB common;
1266
    int64_t sector_num;
1267
    uint8_t *buf;
1268
    int nb_sectors;
1269
    int n;
1270
    uint64_t cluster_offset;
1271
    uint8_t *cluster_data;
1272
    BlockDriverAIOCB *hd_aiocb;
1273
    QEMUBH *bh;
1274
    QCowL2Meta l2meta;
1275
} QCowAIOCB;
1276

    
1277
static void qcow_aio_read_cb(void *opaque, int ret);
1278
static void qcow_aio_read_bh(void *opaque)
1279
{
1280
    QCowAIOCB *acb = opaque;
1281
    qemu_bh_delete(acb->bh);
1282
    acb->bh = NULL;
1283
    qcow_aio_read_cb(opaque, 0);
1284
}
1285

    
1286
static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
1287
{
1288
    if (acb->bh)
1289
        return -EIO;
1290

    
1291
    acb->bh = qemu_bh_new(cb, acb);
1292
    if (!acb->bh)
1293
        return -EIO;
1294

    
1295
    qemu_bh_schedule(acb->bh);
1296

    
1297
    return 0;
1298
}
1299

    
1300
static void qcow_aio_read_cb(void *opaque, int ret)
1301
{
1302
    QCowAIOCB *acb = opaque;
1303
    BlockDriverState *bs = acb->common.bs;
1304
    BDRVQcowState *s = bs->opaque;
1305
    int index_in_cluster, n1;
1306

    
1307
    acb->hd_aiocb = NULL;
1308
    if (ret < 0) {
1309
fail:
1310
        acb->common.cb(acb->common.opaque, ret);
1311
        qemu_aio_release(acb);
1312
        return;
1313
    }
1314

    
1315
    /* post process the read buffer */
1316
    if (!acb->cluster_offset) {
1317
        /* nothing to do */
1318
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1319
        /* nothing to do */
1320
    } else {
1321
        if (s->crypt_method) {
1322
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
1323
                            acb->n, 0,
1324
                            &s->aes_decrypt_key);
1325
        }
1326
    }
1327

    
1328
    acb->nb_sectors -= acb->n;
1329
    acb->sector_num += acb->n;
1330
    acb->buf += acb->n * 512;
1331

    
1332
    if (acb->nb_sectors == 0) {
1333
        /* request completed */
1334
        acb->common.cb(acb->common.opaque, 0);
1335
        qemu_aio_release(acb);
1336
        return;
1337
    }
1338

    
1339
    /* prepare next AIO request */
1340
    acb->n = acb->nb_sectors;
1341
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
1342
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1343

    
1344
    if (!acb->cluster_offset) {
1345
        if (bs->backing_hd) {
1346
            /* read from the base image */
1347
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
1348
                               acb->buf, acb->n);
1349
            if (n1 > 0) {
1350
                acb->hd_aiocb = bdrv_aio_read(bs->backing_hd, acb->sector_num,
1351
                                    acb->buf, acb->n, qcow_aio_read_cb, acb);
1352
                if (acb->hd_aiocb == NULL)
1353
                    goto fail;
1354
            } else {
1355
                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1356
                if (ret < 0)
1357
                    goto fail;
1358
            }
1359
        } else {
1360
            /* Note: in this case, no need to wait */
1361
            memset(acb->buf, 0, 512 * acb->n);
1362
            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1363
            if (ret < 0)
1364
                goto fail;
1365
        }
1366
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1367
        /* add AIO support for compressed blocks ? */
1368
        if (decompress_cluster(s, acb->cluster_offset) < 0)
1369
            goto fail;
1370
        memcpy(acb->buf,
1371
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
1372
        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1373
        if (ret < 0)
1374
            goto fail;
1375
    } else {
1376
        if ((acb->cluster_offset & 511) != 0) {
1377
            ret = -EIO;
1378
            goto fail;
1379
        }
1380
        acb->hd_aiocb = bdrv_aio_read(s->hd,
1381
                            (acb->cluster_offset >> 9) + index_in_cluster,
1382
                            acb->buf, acb->n, qcow_aio_read_cb, acb);
1383
        if (acb->hd_aiocb == NULL)
1384
            goto fail;
1385
    }
1386
}
1387

    
1388
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1389
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1390
        BlockDriverCompletionFunc *cb, void *opaque)
1391
{
1392
    QCowAIOCB *acb;
1393

    
1394
    acb = qemu_aio_get(bs, cb, opaque);
1395
    if (!acb)
1396
        return NULL;
1397
    acb->hd_aiocb = NULL;
1398
    acb->sector_num = sector_num;
1399
    acb->buf = buf;
1400
    acb->nb_sectors = nb_sectors;
1401
    acb->n = 0;
1402
    acb->cluster_offset = 0;
1403
    acb->l2meta.nb_clusters = 0;
1404
    return acb;
1405
}
1406

    
1407
static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
1408
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1409
        BlockDriverCompletionFunc *cb, void *opaque)
1410
{
1411
    QCowAIOCB *acb;
1412

    
1413
    acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
1414
    if (!acb)
1415
        return NULL;
1416

    
1417
    qcow_aio_read_cb(acb, 0);
1418
    return &acb->common;
1419
}
1420

    
1421
static void qcow_aio_write_cb(void *opaque, int ret)
1422
{
1423
    QCowAIOCB *acb = opaque;
1424
    BlockDriverState *bs = acb->common.bs;
1425
    BDRVQcowState *s = bs->opaque;
1426
    int index_in_cluster;
1427
    const uint8_t *src_buf;
1428
    int n_end;
1429

    
1430
    acb->hd_aiocb = NULL;
1431

    
1432
    if (ret < 0) {
1433
    fail:
1434
        acb->common.cb(acb->common.opaque, ret);
1435
        qemu_aio_release(acb);
1436
        return;
1437
    }
1438

    
1439
    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
1440
        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
1441
        goto fail;
1442
    }
1443

    
1444
    acb->nb_sectors -= acb->n;
1445
    acb->sector_num += acb->n;
1446
    acb->buf += acb->n * 512;
1447

    
1448
    if (acb->nb_sectors == 0) {
1449
        /* request completed */
1450
        acb->common.cb(acb->common.opaque, 0);
1451
        qemu_aio_release(acb);
1452
        return;
1453
    }
1454

    
1455
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1456
    n_end = index_in_cluster + acb->nb_sectors;
1457
    if (s->crypt_method &&
1458
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1459
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1460

    
1461
    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1462
                                          index_in_cluster,
1463
                                          n_end, &acb->n, &acb->l2meta);
1464
    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
1465
        ret = -EIO;
1466
        goto fail;
1467
    }
1468
    if (s->crypt_method) {
1469
        if (!acb->cluster_data) {
1470
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
1471
                                             s->cluster_size);
1472
        }
1473
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
1474
                        acb->n, 1, &s->aes_encrypt_key);
1475
        src_buf = acb->cluster_data;
1476
    } else {
1477
        src_buf = acb->buf;
1478
    }
1479
    acb->hd_aiocb = bdrv_aio_write(s->hd,
1480
                                   (acb->cluster_offset >> 9) + index_in_cluster,
1481
                                   src_buf, acb->n,
1482
                                   qcow_aio_write_cb, acb);
1483
    if (acb->hd_aiocb == NULL)
1484
        goto fail;
1485
}
1486

    
1487
static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
1488
        int64_t sector_num, const uint8_t *buf, int nb_sectors,
1489
        BlockDriverCompletionFunc *cb, void *opaque)
1490
{
1491
    BDRVQcowState *s = bs->opaque;
1492
    QCowAIOCB *acb;
1493

    
1494
    s->cluster_cache_offset = -1; /* disable compressed cache */
1495

    
1496
    acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
1497
    if (!acb)
1498
        return NULL;
1499

    
1500
    qcow_aio_write_cb(acb, 0);
1501
    return &acb->common;
1502
}
1503

    
1504
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
1505
{
1506
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;
1507
    if (acb->hd_aiocb)
1508
        bdrv_aio_cancel(acb->hd_aiocb);
1509
    qemu_aio_release(acb);
1510
}
1511

    
1512
static void qcow_close(BlockDriverState *bs)
1513
{
1514
    BDRVQcowState *s = bs->opaque;
1515
    qemu_free(s->l1_table);
1516
    qemu_free(s->l2_cache);
1517
    qemu_free(s->cluster_cache);
1518
    qemu_free(s->cluster_data);
1519
    refcount_close(bs);
1520
    bdrv_delete(s->hd);
1521
}
1522

    
1523
/* XXX: use std qcow open function ? */
1524
typedef struct QCowCreateState {
1525
    int cluster_size;
1526
    int cluster_bits;
1527
    uint16_t *refcount_block;
1528
    uint64_t *refcount_table;
1529
    int64_t l1_table_offset;
1530
    int64_t refcount_table_offset;
1531
    int64_t refcount_block_offset;
1532
} QCowCreateState;
1533

    
1534
static void create_refcount_update(QCowCreateState *s,
1535
                                   int64_t offset, int64_t size)
1536
{
1537
    int refcount;
1538
    int64_t start, last, cluster_offset;
1539
    uint16_t *p;
1540

    
1541
    start = offset & ~(s->cluster_size - 1);
1542
    last = (offset + size - 1)  & ~(s->cluster_size - 1);
1543
    for(cluster_offset = start; cluster_offset <= last;
1544
        cluster_offset += s->cluster_size) {
1545
        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
1546
        refcount = be16_to_cpu(*p);
1547
        refcount++;
1548
        *p = cpu_to_be16(refcount);
1549
    }
1550
}
1551

    
1552
static int qcow_create2(const char *filename, int64_t total_size,
1553
                        const char *backing_file, const char *backing_format,
1554
                        int flags)
1555
{
1556

    
1557
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
1558
    int ref_clusters, backing_format_len = 0;
1559
    QCowHeader header;
1560
    uint64_t tmp, offset;
1561
    QCowCreateState s1, *s = &s1;
1562
    QCowExtension ext_bf = {0, 0};
1563

    
1564

    
1565
    memset(s, 0, sizeof(*s));
1566

    
1567
    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
1568
    if (fd < 0)
1569
        return -1;
1570
    memset(&header, 0, sizeof(header));
1571
    header.magic = cpu_to_be32(QCOW_MAGIC);
1572
    header.version = cpu_to_be32(QCOW_VERSION);
1573
    header.size = cpu_to_be64(total_size * 512);
1574
    header_size = sizeof(header);
1575
    backing_filename_len = 0;
1576
    if (backing_file) {
1577
        if (backing_format) {
1578
            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
1579
            backing_format_len = strlen(backing_format);
1580
            ext_bf.len = (backing_format_len + 7) & ~7;
1581
            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
1582
        }
1583
        header.backing_file_offset = cpu_to_be64(header_size);
1584
        backing_filename_len = strlen(backing_file);
1585
        header.backing_file_size = cpu_to_be32(backing_filename_len);
1586
        header_size += backing_filename_len;
1587
    }
1588
    s->cluster_bits = 12;  /* 4 KB clusters */
1589
    s->cluster_size = 1 << s->cluster_bits;
1590
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
1591
    header_size = (header_size + 7) & ~7;
1592
    if (flags & BLOCK_FLAG_ENCRYPT) {
1593
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1594
    } else {
1595
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1596
    }
1597
    l2_bits = s->cluster_bits - 3;
1598
    shift = s->cluster_bits + l2_bits;
1599
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
1600
    offset = align_offset(header_size, s->cluster_size);
1601
    s->l1_table_offset = offset;
1602
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
1603
    header.l1_size = cpu_to_be32(l1_size);
1604
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
1605

    
1606
    s->refcount_table = qemu_mallocz(s->cluster_size);
1607

    
1608
    s->refcount_table_offset = offset;
1609
    header.refcount_table_offset = cpu_to_be64(offset);
1610
    header.refcount_table_clusters = cpu_to_be32(1);
1611
    offset += s->cluster_size;
1612
    s->refcount_block_offset = offset;
1613

    
1614
    /* count how many refcount blocks needed */
1615
    tmp = offset >> s->cluster_bits;
1616
    ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1;
1617
    for (i=0; i < ref_clusters; i++) {
1618
        s->refcount_table[i] = cpu_to_be64(offset);
1619
        offset += s->cluster_size;
1620
    }
1621

    
1622
    s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size);
1623

    
1624
    /* update refcounts */
1625
    create_refcount_update(s, 0, header_size);
1626
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
1627
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
1628
    create_refcount_update(s, s->refcount_block_offset, ref_clusters * s->cluster_size);
1629

    
1630
    /* write all the data */
1631
    write(fd, &header, sizeof(header));
1632
    if (backing_file) {
1633
        if (backing_format_len) {
1634
            char zero[16];
1635
            int d = ext_bf.len - backing_format_len;
1636

    
1637
            memset(zero, 0, sizeof(zero));
1638
            cpu_to_be32s(&ext_bf.magic);
1639
            cpu_to_be32s(&ext_bf.len);
1640
            write(fd, &ext_bf, sizeof(ext_bf));
1641
            write(fd, backing_format, backing_format_len);
1642
            if (d>0) {
1643
                write(fd, zero, d);
1644
            }
1645
        }
1646
        write(fd, backing_file, backing_filename_len);
1647
    }
1648
    lseek(fd, s->l1_table_offset, SEEK_SET);
1649
    tmp = 0;
1650
    for(i = 0;i < l1_size; i++) {
1651
        write(fd, &tmp, sizeof(tmp));
1652
    }
1653
    lseek(fd, s->refcount_table_offset, SEEK_SET);
1654
    write(fd, s->refcount_table, s->cluster_size);
1655

    
1656
    lseek(fd, s->refcount_block_offset, SEEK_SET);
1657
    write(fd, s->refcount_block, ref_clusters * s->cluster_size);
1658

    
1659
    qemu_free(s->refcount_table);
1660
    qemu_free(s->refcount_block);
1661
    close(fd);
1662
    return 0;
1663
}
1664

    
1665
static int qcow_create(const char *filename, int64_t total_size,
1666
                       const char *backing_file, int flags)
1667
{
1668
    return qcow_create2(filename, total_size, backing_file, NULL, flags);
1669
}
1670

    
1671
static int qcow_make_empty(BlockDriverState *bs)
1672
{
1673
#if 0
1674
    /* XXX: not correct */
1675
    BDRVQcowState *s = bs->opaque;
1676
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1677
    int ret;
1678

1679
    memset(s->l1_table, 0, l1_length);
1680
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
1681
        return -1;
1682
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
1683
    if (ret < 0)
1684
        return ret;
1685

1686
    l2_cache_reset(bs);
1687
#endif
1688
    return 0;
1689
}
1690

    
1691
/* XXX: put compressed sectors first, then all the cluster aligned
1692
   tables to avoid losing bytes in alignment */
1693
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
1694
                                 const uint8_t *buf, int nb_sectors)
1695
{
1696
    BDRVQcowState *s = bs->opaque;
1697
    z_stream strm;
1698
    int ret, out_len;
1699
    uint8_t *out_buf;
1700
    uint64_t cluster_offset;
1701

    
1702
    if (nb_sectors == 0) {
1703
        /* align end of file to a sector boundary to ease reading with
1704
           sector based I/Os */
1705
        cluster_offset = bdrv_getlength(s->hd);
1706
        cluster_offset = (cluster_offset + 511) & ~511;
1707
        bdrv_truncate(s->hd, cluster_offset);
1708
        return 0;
1709
    }
1710

    
1711
    if (nb_sectors != s->cluster_sectors)
1712
        return -EINVAL;
1713

    
1714
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1715

    
1716
    /* best compression, small window, no zlib header */
1717
    memset(&strm, 0, sizeof(strm));
1718
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1719
                       Z_DEFLATED, -12,
1720
                       9, Z_DEFAULT_STRATEGY);
1721
    if (ret != 0) {
1722
        qemu_free(out_buf);
1723
        return -1;
1724
    }
1725

    
1726
    strm.avail_in = s->cluster_size;
1727
    strm.next_in = (uint8_t *)buf;
1728
    strm.avail_out = s->cluster_size;
1729
    strm.next_out = out_buf;
1730

    
1731
    ret = deflate(&strm, Z_FINISH);
1732
    if (ret != Z_STREAM_END && ret != Z_OK) {
1733
        qemu_free(out_buf);
1734
        deflateEnd(&strm);
1735
        return -1;
1736
    }
1737
    out_len = strm.next_out - out_buf;
1738

    
1739
    deflateEnd(&strm);
1740

    
1741
    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1742
        /* could not compress: write normal cluster */
1743
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
1744
    } else {
1745
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
1746
                                              out_len);
1747
        if (!cluster_offset)
1748
            return -1;
1749
        cluster_offset &= s->cluster_offset_mask;
1750
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
1751
            qemu_free(out_buf);
1752
            return -1;
1753
        }
1754
    }
1755

    
1756
    qemu_free(out_buf);
1757
    return 0;
1758
}
1759

    
1760
static void qcow_flush(BlockDriverState *bs)
1761
{
1762
    BDRVQcowState *s = bs->opaque;
1763
    bdrv_flush(s->hd);
1764
}
1765

    
1766
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1767
{
1768
    BDRVQcowState *s = bs->opaque;
1769
    bdi->cluster_size = s->cluster_size;
1770
    bdi->vm_state_offset = (int64_t)s->l1_vm_state_index <<
1771
        (s->cluster_bits + s->l2_bits);
1772
    return 0;
1773
}
1774

    
1775
/*********************************************************/
1776
/* snapshot support */
1777

    
1778
/* update the refcounts of snapshots and the copied flag */
1779
static int update_snapshot_refcount(BlockDriverState *bs,
1780
                                    int64_t l1_table_offset,
1781
                                    int l1_size,
1782
                                    int addend)
1783
{
1784
    BDRVQcowState *s = bs->opaque;
1785
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
1786
    int64_t old_offset, old_l2_offset;
1787
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
1788

    
1789
    l2_cache_reset(bs);
1790

    
1791
    l2_table = NULL;
1792
    l1_table = NULL;
1793
    l1_size2 = l1_size * sizeof(uint64_t);
1794
    l1_allocated = 0;
1795
    if (l1_table_offset != s->l1_table_offset) {
1796
        l1_table = qemu_malloc(l1_size2);
1797
        l1_allocated = 1;
1798
        if (bdrv_pread(s->hd, l1_table_offset,
1799
                       l1_table, l1_size2) != l1_size2)
1800
            goto fail;
1801
        for(i = 0;i < l1_size; i++)
1802
            be64_to_cpus(&l1_table[i]);
1803
    } else {
1804
        assert(l1_size == s->l1_size);
1805
        l1_table = s->l1_table;
1806
        l1_allocated = 0;
1807
    }
1808

    
1809
    l2_size = s->l2_size * sizeof(uint64_t);
1810
    l2_table = qemu_malloc(l2_size);
1811
    l1_modified = 0;
1812
    for(i = 0; i < l1_size; i++) {
1813
        l2_offset = l1_table[i];
1814
        if (l2_offset) {
1815
            old_l2_offset = l2_offset;
1816
            l2_offset &= ~QCOW_OFLAG_COPIED;
1817
            l2_modified = 0;
1818
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
1819
                goto fail;
1820
            for(j = 0; j < s->l2_size; j++) {
1821
                offset = be64_to_cpu(l2_table[j]);
1822
                if (offset != 0) {
1823
                    old_offset = offset;
1824
                    offset &= ~QCOW_OFLAG_COPIED;
1825
                    if (offset & QCOW_OFLAG_COMPRESSED) {
1826
                        nb_csectors = ((offset >> s->csize_shift) &
1827
                                       s->csize_mask) + 1;
1828
                        if (addend != 0)
1829
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
1830
                                            nb_csectors * 512, addend);
1831
                        /* compressed clusters are never modified */
1832
                        refcount = 2;
1833
                    } else {
1834
                        if (addend != 0) {
1835
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
1836
                        } else {
1837
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
1838
                        }
1839
                    }
1840

    
1841
                    if (refcount == 1) {
1842
                        offset |= QCOW_OFLAG_COPIED;
1843
                    }
1844
                    if (offset != old_offset) {
1845
                        l2_table[j] = cpu_to_be64(offset);
1846
                        l2_modified = 1;
1847
                    }
1848
                }
1849
            }
1850
            if (l2_modified) {
1851
                if (bdrv_pwrite(s->hd,
1852
                                l2_offset, l2_table, l2_size) != l2_size)
1853
                    goto fail;
1854
            }
1855

    
1856
            if (addend != 0) {
1857
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
1858
            } else {
1859
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1860
            }
1861
            if (refcount == 1) {
1862
                l2_offset |= QCOW_OFLAG_COPIED;
1863
            }
1864
            if (l2_offset != old_l2_offset) {
1865
                l1_table[i] = l2_offset;
1866
                l1_modified = 1;
1867
            }
1868
        }
1869
    }
1870
    if (l1_modified) {
1871
        for(i = 0; i < l1_size; i++)
1872
            cpu_to_be64s(&l1_table[i]);
1873
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
1874
                        l1_size2) != l1_size2)
1875
            goto fail;
1876
        for(i = 0; i < l1_size; i++)
1877
            be64_to_cpus(&l1_table[i]);
1878
    }
1879
    if (l1_allocated)
1880
        qemu_free(l1_table);
1881
    qemu_free(l2_table);
1882
    return 0;
1883
 fail:
1884
    if (l1_allocated)
1885
        qemu_free(l1_table);
1886
    qemu_free(l2_table);
1887
    return -EIO;
1888
}
1889

    
1890
static void qcow_free_snapshots(BlockDriverState *bs)
1891
{
1892
    BDRVQcowState *s = bs->opaque;
1893
    int i;
1894

    
1895
    for(i = 0; i < s->nb_snapshots; i++) {
1896
        qemu_free(s->snapshots[i].name);
1897
        qemu_free(s->snapshots[i].id_str);
1898
    }
1899
    qemu_free(s->snapshots);
1900
    s->snapshots = NULL;
1901
    s->nb_snapshots = 0;
1902
}
1903

    
1904
static int qcow_read_snapshots(BlockDriverState *bs)
1905
{
1906
    BDRVQcowState *s = bs->opaque;
1907
    QCowSnapshotHeader h;
1908
    QCowSnapshot *sn;
1909
    int i, id_str_size, name_size;
1910
    int64_t offset;
1911
    uint32_t extra_data_size;
1912

    
1913
    if (!s->nb_snapshots) {
1914
        s->snapshots = NULL;
1915
        s->snapshots_size = 0;
1916
        return 0;
1917
    }
1918

    
1919
    offset = s->snapshots_offset;
1920
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
1921
    for(i = 0; i < s->nb_snapshots; i++) {
1922
        offset = align_offset(offset, 8);
1923
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1924
            goto fail;
1925
        offset += sizeof(h);
1926
        sn = s->snapshots + i;
1927
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
1928
        sn->l1_size = be32_to_cpu(h.l1_size);
1929
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
1930
        sn->date_sec = be32_to_cpu(h.date_sec);
1931
        sn->date_nsec = be32_to_cpu(h.date_nsec);
1932
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
1933
        extra_data_size = be32_to_cpu(h.extra_data_size);
1934

    
1935
        id_str_size = be16_to_cpu(h.id_str_size);
1936
        name_size = be16_to_cpu(h.name_size);
1937

    
1938
        offset += extra_data_size;
1939

    
1940
        sn->id_str = qemu_malloc(id_str_size + 1);
1941
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1942
            goto fail;
1943
        offset += id_str_size;
1944
        sn->id_str[id_str_size] = '\0';
1945

    
1946
        sn->name = qemu_malloc(name_size + 1);
1947
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
1948
            goto fail;
1949
        offset += name_size;
1950
        sn->name[name_size] = '\0';
1951
    }
1952
    s->snapshots_size = offset - s->snapshots_offset;
1953
    return 0;
1954
 fail:
1955
    qcow_free_snapshots(bs);
1956
    return -1;
1957
}
1958

    
1959
/* add at the end of the file a new list of snapshots */
1960
static int qcow_write_snapshots(BlockDriverState *bs)
1961
{
1962
    BDRVQcowState *s = bs->opaque;
1963
    QCowSnapshot *sn;
1964
    QCowSnapshotHeader h;
1965
    int i, name_size, id_str_size, snapshots_size;
1966
    uint64_t data64;
1967
    uint32_t data32;
1968
    int64_t offset, snapshots_offset;
1969

    
1970
    /* compute the size of the snapshots */
1971
    offset = 0;
1972
    for(i = 0; i < s->nb_snapshots; i++) {
1973
        sn = s->snapshots + i;
1974
        offset = align_offset(offset, 8);
1975
        offset += sizeof(h);
1976
        offset += strlen(sn->id_str);
1977
        offset += strlen(sn->name);
1978
    }
1979
    snapshots_size = offset;
1980

    
1981
    snapshots_offset = alloc_clusters(bs, snapshots_size);
1982
    offset = snapshots_offset;
1983

    
1984
    for(i = 0; i < s->nb_snapshots; i++) {
1985
        sn = s->snapshots + i;
1986
        memset(&h, 0, sizeof(h));
1987
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
1988
        h.l1_size = cpu_to_be32(sn->l1_size);
1989
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
1990
        h.date_sec = cpu_to_be32(sn->date_sec);
1991
        h.date_nsec = cpu_to_be32(sn->date_nsec);
1992
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
1993

    
1994
        id_str_size = strlen(sn->id_str);
1995
        name_size = strlen(sn->name);
1996
        h.id_str_size = cpu_to_be16(id_str_size);
1997
        h.name_size = cpu_to_be16(name_size);
1998
        offset = align_offset(offset, 8);
1999
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
2000
            goto fail;
2001
        offset += sizeof(h);
2002
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
2003
            goto fail;
2004
        offset += id_str_size;
2005
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
2006
            goto fail;
2007
        offset += name_size;
2008
    }
2009

    
2010
    /* update the various header fields */
2011
    data64 = cpu_to_be64(snapshots_offset);
2012
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
2013
                    &data64, sizeof(data64)) != sizeof(data64))
2014
        goto fail;
2015
    data32 = cpu_to_be32(s->nb_snapshots);
2016
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
2017
                    &data32, sizeof(data32)) != sizeof(data32))
2018
        goto fail;
2019

    
2020
    /* free the old snapshot table */
2021
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
2022
    s->snapshots_offset = snapshots_offset;
2023
    s->snapshots_size = snapshots_size;
2024
    return 0;
2025
 fail:
2026
    return -1;
2027
}
2028

    
2029
static void find_new_snapshot_id(BlockDriverState *bs,
2030
                                 char *id_str, int id_str_size)
2031
{
2032
    BDRVQcowState *s = bs->opaque;
2033
    QCowSnapshot *sn;
2034
    int i, id, id_max = 0;
2035

    
2036
    for(i = 0; i < s->nb_snapshots; i++) {
2037
        sn = s->snapshots + i;
2038
        id = strtoul(sn->id_str, NULL, 10);
2039
        if (id > id_max)
2040
            id_max = id;
2041
    }
2042
    snprintf(id_str, id_str_size, "%d", id_max + 1);
2043
}
2044

    
2045
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
2046
{
2047
    BDRVQcowState *s = bs->opaque;
2048
    int i;
2049

    
2050
    for(i = 0; i < s->nb_snapshots; i++) {
2051
        if (!strcmp(s->snapshots[i].id_str, id_str))
2052
            return i;
2053
    }
2054
    return -1;
2055
}
2056

    
2057
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
2058
{
2059
    BDRVQcowState *s = bs->opaque;
2060
    int i, ret;
2061

    
2062
    ret = find_snapshot_by_id(bs, name);
2063
    if (ret >= 0)
2064
        return ret;
2065
    for(i = 0; i < s->nb_snapshots; i++) {
2066
        if (!strcmp(s->snapshots[i].name, name))
2067
            return i;
2068
    }
2069
    return -1;
2070
}
2071

    
2072
/* if no id is provided, a new one is constructed */
2073
static int qcow_snapshot_create(BlockDriverState *bs,
2074
                                QEMUSnapshotInfo *sn_info)
2075
{
2076
    BDRVQcowState *s = bs->opaque;
2077
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
2078
    int i, ret;
2079
    uint64_t *l1_table = NULL;
2080

    
2081
    memset(sn, 0, sizeof(*sn));
2082

    
2083
    if (sn_info->id_str[0] == '\0') {
2084
        /* compute a new id */
2085
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
2086
    }
2087

    
2088
    /* check that the ID is unique */
2089
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
2090
        return -ENOENT;
2091

    
2092
    sn->id_str = qemu_strdup(sn_info->id_str);
2093
    if (!sn->id_str)
2094
        goto fail;
2095
    sn->name = qemu_strdup(sn_info->name);
2096
    if (!sn->name)
2097
        goto fail;
2098
    sn->vm_state_size = sn_info->vm_state_size;
2099
    sn->date_sec = sn_info->date_sec;
2100
    sn->date_nsec = sn_info->date_nsec;
2101
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
2102

    
2103
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
2104
    if (ret < 0)
2105
        goto fail;
2106

    
2107
    /* create the L1 table of the snapshot */
2108
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
2109
    sn->l1_size = s->l1_size;
2110

    
2111
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
2112
    for(i = 0; i < s->l1_size; i++) {
2113
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
2114
    }
2115
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
2116
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
2117
        (s->l1_size * sizeof(uint64_t)))
2118
        goto fail;
2119
    qemu_free(l1_table);
2120
    l1_table = NULL;
2121

    
2122
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
2123
    if (s->snapshots) {
2124
        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
2125
        qemu_free(s->snapshots);
2126
    }
2127
    s->snapshots = snapshots1;
2128
    s->snapshots[s->nb_snapshots++] = *sn;
2129

    
2130
    if (qcow_write_snapshots(bs) < 0)
2131
        goto fail;
2132
#ifdef DEBUG_ALLOC
2133
    check_refcounts(bs);
2134
#endif
2135
    return 0;
2136
 fail:
2137
    qemu_free(sn->name);
2138
    qemu_free(l1_table);
2139
    return -1;
2140
}
2141

    
2142
/* copy the snapshot 'snapshot_name' into the current disk image */
2143
static int qcow_snapshot_goto(BlockDriverState *bs,
2144
                              const char *snapshot_id)
2145
{
2146
    BDRVQcowState *s = bs->opaque;
2147
    QCowSnapshot *sn;
2148
    int i, snapshot_index, l1_size2;
2149

    
2150
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
2151
    if (snapshot_index < 0)
2152
        return -ENOENT;
2153
    sn = &s->snapshots[snapshot_index];
2154

    
2155
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
2156
        goto fail;
2157

    
2158
    if (grow_l1_table(bs, sn->l1_size) < 0)
2159
        goto fail;
2160

    
2161
    s->l1_size = sn->l1_size;
2162
    l1_size2 = s->l1_size * sizeof(uint64_t);
2163
    /* copy the snapshot l1 table to the current l1 table */
2164
    if (bdrv_pread(s->hd, sn->l1_table_offset,
2165
                   s->l1_table, l1_size2) != l1_size2)
2166
        goto fail;
2167
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
2168
                    s->l1_table, l1_size2) != l1_size2)
2169
        goto fail;
2170
    for(i = 0;i < s->l1_size; i++) {
2171
        be64_to_cpus(&s->l1_table[i]);
2172
    }
2173

    
2174
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
2175
        goto fail;
2176

    
2177
#ifdef DEBUG_ALLOC
2178
    check_refcounts(bs);
2179
#endif
2180
    return 0;
2181
 fail:
2182
    return -EIO;
2183
}
2184

    
2185
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2186
{
2187
    BDRVQcowState *s = bs->opaque;
2188
    QCowSnapshot *sn;
2189
    int snapshot_index, ret;
2190

    
2191
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
2192
    if (snapshot_index < 0)
2193
        return -ENOENT;
2194
    sn = &s->snapshots[snapshot_index];
2195

    
2196
    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
2197
    if (ret < 0)
2198
        return ret;
2199
    /* must update the copied flag on the current cluster offsets */
2200
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
2201
    if (ret < 0)
2202
        return ret;
2203
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));
2204

    
2205
    qemu_free(sn->id_str);
2206
    qemu_free(sn->name);
2207
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
2208
    s->nb_snapshots--;
2209
    ret = qcow_write_snapshots(bs);
2210
    if (ret < 0) {
2211
        /* XXX: restore snapshot if error ? */
2212
        return ret;
2213
    }
2214
#ifdef DEBUG_ALLOC
2215
    check_refcounts(bs);
2216
#endif
2217
    return 0;
2218
}
2219

    
2220
static int qcow_snapshot_list(BlockDriverState *bs,
2221
                              QEMUSnapshotInfo **psn_tab)
2222
{
2223
    BDRVQcowState *s = bs->opaque;
2224
    QEMUSnapshotInfo *sn_tab, *sn_info;
2225
    QCowSnapshot *sn;
2226
    int i;
2227

    
2228
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2229
    for(i = 0; i < s->nb_snapshots; i++) {
2230
        sn_info = sn_tab + i;
2231
        sn = s->snapshots + i;
2232
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2233
                sn->id_str);
2234
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2235
                sn->name);
2236
        sn_info->vm_state_size = sn->vm_state_size;
2237
        sn_info->date_sec = sn->date_sec;
2238
        sn_info->date_nsec = sn->date_nsec;
2239
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2240
    }
2241
    *psn_tab = sn_tab;
2242
    return s->nb_snapshots;
2243
}
2244

    
2245
/*********************************************************/
2246
/* refcount handling */
2247

    
2248
static int refcount_init(BlockDriverState *bs)
2249
{
2250
    BDRVQcowState *s = bs->opaque;
2251
    int ret, refcount_table_size2, i;
2252

    
2253
    s->refcount_block_cache = qemu_malloc(s->cluster_size);
2254
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
2255
    s->refcount_table = qemu_malloc(refcount_table_size2);
2256
    if (s->refcount_table_size > 0) {
2257
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
2258
                         s->refcount_table, refcount_table_size2);
2259
        if (ret != refcount_table_size2)
2260
            goto fail;
2261
        for(i = 0; i < s->refcount_table_size; i++)
2262
            be64_to_cpus(&s->refcount_table[i]);
2263
    }
2264
    return 0;
2265
 fail:
2266
    return -ENOMEM;
2267
}
2268

    
2269
static void refcount_close(BlockDriverState *bs)
2270
{
2271
    BDRVQcowState *s = bs->opaque;
2272
    qemu_free(s->refcount_block_cache);
2273
    qemu_free(s->refcount_table);
2274
}
2275

    
2276

    
2277
static int load_refcount_block(BlockDriverState *bs,
2278
                               int64_t refcount_block_offset)
2279
{
2280
    BDRVQcowState *s = bs->opaque;
2281
    int ret;
2282
    ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
2283
                     s->cluster_size);
2284
    if (ret != s->cluster_size)
2285
        return -EIO;
2286
    s->refcount_block_cache_offset = refcount_block_offset;
2287
    return 0;
2288
}
2289

    
2290
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
2291
{
2292
    BDRVQcowState *s = bs->opaque;
2293
    int refcount_table_index, block_index;
2294
    int64_t refcount_block_offset;
2295

    
2296
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2297
    if (refcount_table_index >= s->refcount_table_size)
2298
        return 0;
2299
    refcount_block_offset = s->refcount_table[refcount_table_index];
2300
    if (!refcount_block_offset)
2301
        return 0;
2302
    if (refcount_block_offset != s->refcount_block_cache_offset) {
2303
        /* better than nothing: return allocated if read error */
2304
        if (load_refcount_block(bs, refcount_block_offset) < 0)
2305
            return 1;
2306
    }
2307
    block_index = cluster_index &
2308
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2309
    return be16_to_cpu(s->refcount_block_cache[block_index]);
2310
}
2311

    
2312
/* return < 0 if error */
2313
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2314
{
2315
    BDRVQcowState *s = bs->opaque;
2316
    int i, nb_clusters;
2317

    
2318
    nb_clusters = size_to_clusters(s, size);
2319
retry:
2320
    for(i = 0; i < nb_clusters; i++) {
2321
        int64_t i = s->free_cluster_index++;
2322
        if (get_refcount(bs, i) != 0)
2323
            goto retry;
2324
    }
2325
#ifdef DEBUG_ALLOC2
2326
    printf("alloc_clusters: size=%lld -> %lld\n",
2327
            size,
2328
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2329
#endif
2330
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2331
}
2332

    
2333
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
2334
{
2335
    int64_t offset;
2336

    
2337
    offset = alloc_clusters_noref(bs, size);
2338
    update_refcount(bs, offset, size, 1);
2339
    return offset;
2340
}
2341

    
2342
/* only used to allocate compressed sectors. We try to allocate
2343
   contiguous sectors. size must be <= cluster_size */
2344
static int64_t alloc_bytes(BlockDriverState *bs, int size)
2345
{
2346
    BDRVQcowState *s = bs->opaque;
2347
    int64_t offset, cluster_offset;
2348
    int free_in_cluster;
2349

    
2350
    assert(size > 0 && size <= s->cluster_size);
2351
    if (s->free_byte_offset == 0) {
2352
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
2353
    }
2354
 redo:
2355
    free_in_cluster = s->cluster_size -
2356
        (s->free_byte_offset & (s->cluster_size - 1));
2357
    if (size <= free_in_cluster) {
2358
        /* enough space in current cluster */
2359
        offset = s->free_byte_offset;
2360
        s->free_byte_offset += size;
2361
        free_in_cluster -= size;
2362
        if (free_in_cluster == 0)
2363
            s->free_byte_offset = 0;
2364
        if ((offset & (s->cluster_size - 1)) != 0)
2365
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2366
    } else {
2367
        offset = alloc_clusters(bs, s->cluster_size);
2368
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
2369
        if ((cluster_offset + s->cluster_size) == offset) {
2370
            /* we are lucky: contiguous data */
2371
            offset = s->free_byte_offset;
2372
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2373
            s->free_byte_offset += size;
2374
        } else {
2375
            s->free_byte_offset = offset;
2376
            goto redo;
2377
        }
2378
    }
2379
    return offset;
2380
}
2381

    
2382
static void free_clusters(BlockDriverState *bs,
2383
                          int64_t offset, int64_t size)
2384
{
2385
    update_refcount(bs, offset, size, -1);
2386
}
2387

    
2388
static int grow_refcount_table(BlockDriverState *bs, int min_size)
2389
{
2390
    BDRVQcowState *s = bs->opaque;
2391
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
2392
    uint64_t *new_table;
2393
    int64_t table_offset;
2394
    uint8_t data[12];
2395
    int old_table_size;
2396
    int64_t old_table_offset;
2397

    
2398
    if (min_size <= s->refcount_table_size)
2399
        return 0;
2400
    /* compute new table size */
2401
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
2402
    for(;;) {
2403
        if (refcount_table_clusters == 0) {
2404
            refcount_table_clusters = 1;
2405
        } else {
2406
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
2407
        }
2408
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
2409
        if (min_size <= new_table_size)
2410
            break;
2411
    }
2412
#ifdef DEBUG_ALLOC2
2413
    printf("grow_refcount_table from %d to %d\n",
2414
           s->refcount_table_size,
2415
           new_table_size);
2416
#endif
2417
    new_table_size2 = new_table_size * sizeof(uint64_t);
2418
    new_table = qemu_mallocz(new_table_size2);
2419
    memcpy(new_table, s->refcount_table,
2420
           s->refcount_table_size * sizeof(uint64_t));
2421
    for(i = 0; i < s->refcount_table_size; i++)
2422
        cpu_to_be64s(&new_table[i]);
2423
    /* Note: we cannot update the refcount now to avoid recursion */
2424
    table_offset = alloc_clusters_noref(bs, new_table_size2);
2425
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
2426
    if (ret != new_table_size2)
2427
        goto fail;
2428
    for(i = 0; i < s->refcount_table_size; i++)
2429
        be64_to_cpus(&new_table[i]);
2430

    
2431
    cpu_to_be64w((uint64_t*)data, table_offset);
2432
    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
2433
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
2434
                    data, sizeof(data)) != sizeof(data))
2435
        goto fail;
2436
    qemu_free(s->refcount_table);
2437
    old_table_offset = s->refcount_table_offset;
2438
    old_table_size = s->refcount_table_size;
2439
    s->refcount_table = new_table;
2440
    s->refcount_table_size = new_table_size;
2441
    s->refcount_table_offset = table_offset;
2442

    
2443
    update_refcount(bs, table_offset, new_table_size2, 1);
2444
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
2445
    return 0;
2446
 fail:
2447
    free_clusters(bs, table_offset, new_table_size2);
2448
    qemu_free(new_table);
2449
    return -EIO;
2450
}
2451

    
2452
/* addend must be 1 or -1 */
2453
/* XXX: cache several refcount block clusters ? */
2454
static int update_cluster_refcount(BlockDriverState *bs,
2455
                                   int64_t cluster_index,
2456
                                   int addend)
2457
{
2458
    BDRVQcowState *s = bs->opaque;
2459
    int64_t offset, refcount_block_offset;
2460
    int ret, refcount_table_index, block_index, refcount;
2461
    uint64_t data64;
2462

    
2463
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2464
    if (refcount_table_index >= s->refcount_table_size) {
2465
        if (addend < 0)
2466
            return -EINVAL;
2467
        ret = grow_refcount_table(bs, refcount_table_index + 1);
2468
        if (ret < 0)
2469
            return ret;
2470
    }
2471
    refcount_block_offset = s->refcount_table[refcount_table_index];
2472
    if (!refcount_block_offset) {
2473
        if (addend < 0)
2474
            return -EINVAL;
2475
        /* create a new refcount block */
2476
        /* Note: we cannot update the refcount now to avoid recursion */
2477
        offset = alloc_clusters_noref(bs, s->cluster_size);
2478
        memset(s->refcount_block_cache, 0, s->cluster_size);
2479
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
2480
        if (ret != s->cluster_size)
2481
            return -EINVAL;
2482
        s->refcount_table[refcount_table_index] = offset;
2483
        data64 = cpu_to_be64(offset);
2484
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
2485
                          refcount_table_index * sizeof(uint64_t),
2486
                          &data64, sizeof(data64));
2487
        if (ret != sizeof(data64))
2488
            return -EINVAL;
2489

    
2490
        refcount_block_offset = offset;
2491
        s->refcount_block_cache_offset = offset;
2492
        update_refcount(bs, offset, s->cluster_size, 1);
2493
    } else {
2494
        if (refcount_block_offset != s->refcount_block_cache_offset) {
2495
            if (load_refcount_block(bs, refcount_block_offset) < 0)
2496
                return -EIO;
2497
        }
2498
    }
2499
    /* we can update the count and save it */
2500
    block_index = cluster_index &
2501
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2502
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
2503
    refcount += addend;
2504
    if (refcount < 0 || refcount > 0xffff)
2505
        return -EINVAL;
2506
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
2507
        s->free_cluster_index = cluster_index;
2508
    }
2509
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
2510
    if (bdrv_pwrite(s->hd,
2511
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
2512
                    &s->refcount_block_cache[block_index], 2) != 2)
2513
        return -EIO;
2514
    return refcount;
2515
}
2516

    
2517
static void update_refcount(BlockDriverState *bs,
2518
                            int64_t offset, int64_t length,
2519
                            int addend)
2520
{
2521
    BDRVQcowState *s = bs->opaque;
2522
    int64_t start, last, cluster_offset;
2523

    
2524
#ifdef DEBUG_ALLOC2
2525
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
2526
           offset, length, addend);
2527
#endif
2528
    if (length <= 0)
2529
        return;
2530
    start = offset & ~(s->cluster_size - 1);
2531
    last = (offset + length - 1) & ~(s->cluster_size - 1);
2532
    for(cluster_offset = start; cluster_offset <= last;
2533
        cluster_offset += s->cluster_size) {
2534
        update_cluster_refcount(bs, cluster_offset >> s->cluster_bits, addend);
2535
    }
2536
}
2537

    
2538
#ifdef DEBUG_ALLOC
2539
static void inc_refcounts(BlockDriverState *bs,
2540
                          uint16_t *refcount_table,
2541
                          int refcount_table_size,
2542
                          int64_t offset, int64_t size)
2543
{
2544
    BDRVQcowState *s = bs->opaque;
2545
    int64_t start, last, cluster_offset;
2546
    int k;
2547

    
2548
    if (size <= 0)
2549
        return;
2550

    
2551
    start = offset & ~(s->cluster_size - 1);
2552
    last = (offset + size - 1) & ~(s->cluster_size - 1);
2553
    for(cluster_offset = start; cluster_offset <= last;
2554
        cluster_offset += s->cluster_size) {
2555
        k = cluster_offset >> s->cluster_bits;
2556
        if (k < 0 || k >= refcount_table_size) {
2557
            printf("ERROR: invalid cluster offset=0x%llx\n", cluster_offset);
2558
        } else {
2559
            if (++refcount_table[k] == 0) {
2560
                printf("ERROR: overflow cluster offset=0x%llx\n", cluster_offset);
2561
            }
2562
        }
2563
    }
2564
}
2565

    
2566
static int check_refcounts_l1(BlockDriverState *bs,
2567
                              uint16_t *refcount_table,
2568
                              int refcount_table_size,
2569
                              int64_t l1_table_offset, int l1_size,
2570
                              int check_copied)
2571
{
2572
    BDRVQcowState *s = bs->opaque;
2573
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
2574
    int l2_size, i, j, nb_csectors, refcount;
2575

    
2576
    l2_table = NULL;
2577
    l1_size2 = l1_size * sizeof(uint64_t);
2578

    
2579
    inc_refcounts(bs, refcount_table, refcount_table_size,
2580
                  l1_table_offset, l1_size2);
2581

    
2582
    l1_table = qemu_malloc(l1_size2);
2583
    if (bdrv_pread(s->hd, l1_table_offset,
2584
                   l1_table, l1_size2) != l1_size2)
2585
        goto fail;
2586
    for(i = 0;i < l1_size; i++)
2587
        be64_to_cpus(&l1_table[i]);
2588

    
2589
    l2_size = s->l2_size * sizeof(uint64_t);
2590
    l2_table = qemu_malloc(l2_size);
2591
    for(i = 0; i < l1_size; i++) {
2592
        l2_offset = l1_table[i];
2593
        if (l2_offset) {
2594
            if (check_copied) {
2595
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
2596
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
2597
                    printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
2598
                           l2_offset, refcount);
2599
                }
2600
            }
2601
            l2_offset &= ~QCOW_OFLAG_COPIED;
2602
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
2603
                goto fail;
2604
            for(j = 0; j < s->l2_size; j++) {
2605
                offset = be64_to_cpu(l2_table[j]);
2606
                if (offset != 0) {
2607
                    if (offset & QCOW_OFLAG_COMPRESSED) {
2608
                        if (offset & QCOW_OFLAG_COPIED) {
2609
                            printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
2610
                                   offset >> s->cluster_bits);
2611
                            offset &= ~QCOW_OFLAG_COPIED;
2612
                        }
2613
                        nb_csectors = ((offset >> s->csize_shift) &
2614
                                       s->csize_mask) + 1;
2615
                        offset &= s->cluster_offset_mask;
2616
                        inc_refcounts(bs, refcount_table,
2617
                                      refcount_table_size,
2618
                                      offset & ~511, nb_csectors * 512);
2619
                    } else {
2620
                        if (check_copied) {
2621
                            refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
2622
                            if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
2623
                                printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
2624
                                       offset, refcount);
2625
                            }
2626
                        }
2627
                        offset &= ~QCOW_OFLAG_COPIED;
2628
                        inc_refcounts(bs, refcount_table,
2629
                                      refcount_table_size,
2630
                                      offset, s->cluster_size);
2631
                    }
2632
                }
2633
            }
2634
            inc_refcounts(bs, refcount_table,
2635
                          refcount_table_size,
2636
                          l2_offset,
2637
                          s->cluster_size);
2638
        }
2639
    }
2640
    qemu_free(l1_table);
2641
    qemu_free(l2_table);
2642
    return 0;
2643
 fail:
2644
    printf("ERROR: I/O error in check_refcounts_l1\n");
2645
    qemu_free(l1_table);
2646
    qemu_free(l2_table);
2647
    return -EIO;
2648
}
2649

    
2650
static void check_refcounts(BlockDriverState *bs)
2651
{
2652
    BDRVQcowState *s = bs->opaque;
2653
    int64_t size;
2654
    int nb_clusters, refcount1, refcount2, i;
2655
    QCowSnapshot *sn;
2656
    uint16_t *refcount_table;
2657

    
2658
    size = bdrv_getlength(s->hd);
2659
    nb_clusters = size_to_clusters(s, size);
2660
    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
2661

    
2662
    /* header */
2663
    inc_refcounts(bs, refcount_table, nb_clusters,
2664
                  0, s->cluster_size);
2665

    
2666
    check_refcounts_l1(bs, refcount_table, nb_clusters,
2667
                       s->l1_table_offset, s->l1_size, 1);
2668

    
2669
    /* snapshots */
2670
    for(i = 0; i < s->nb_snapshots; i++) {
2671
        sn = s->snapshots + i;
2672
        check_refcounts_l1(bs, refcount_table, nb_clusters,
2673
                           sn->l1_table_offset, sn->l1_size, 0);
2674
    }
2675
    inc_refcounts(bs, refcount_table, nb_clusters,
2676
                  s->snapshots_offset, s->snapshots_size);
2677

    
2678
    /* refcount data */
2679
    inc_refcounts(bs, refcount_table, nb_clusters,
2680
                  s->refcount_table_offset,
2681
                  s->refcount_table_size * sizeof(uint64_t));
2682
    for(i = 0; i < s->refcount_table_size; i++) {
2683
        int64_t offset;
2684
        offset = s->refcount_table[i];
2685
        if (offset != 0) {
2686
            inc_refcounts(bs, refcount_table, nb_clusters,
2687
                          offset, s->cluster_size);
2688
        }
2689
    }
2690

    
2691
    /* compare ref counts */
2692
    for(i = 0; i < nb_clusters; i++) {
2693
        refcount1 = get_refcount(bs, i);
2694
        refcount2 = refcount_table[i];
2695
        if (refcount1 != refcount2)
2696
            printf("ERROR cluster %d refcount=%d reference=%d\n",
2697
                   i, refcount1, refcount2);
2698
    }
2699

    
2700
    qemu_free(refcount_table);
2701
}
2702

    
2703
#if 0
2704
static void dump_refcounts(BlockDriverState *bs)
2705
{
2706
    BDRVQcowState *s = bs->opaque;
2707
    int64_t nb_clusters, k, k1, size;
2708
    int refcount;
2709

2710
    size = bdrv_getlength(s->hd);
2711
    nb_clusters = size_to_clusters(s, size);
2712
    for(k = 0; k < nb_clusters;) {
2713
        k1 = k;
2714
        refcount = get_refcount(bs, k);
2715
        k++;
2716
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
2717
            k++;
2718
        printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1);
2719
    }
2720
}
2721
#endif
2722
#endif
2723

    
2724
BlockDriver bdrv_qcow2 = {
2725
    .format_name        = "qcow2",
2726
    .instance_size        = sizeof(BDRVQcowState),
2727
    .bdrv_probe                = qcow_probe,
2728
    .bdrv_open                = qcow_open,
2729
    .bdrv_close                = qcow_close,
2730
    .bdrv_create        = qcow_create,
2731
    .bdrv_flush                = qcow_flush,
2732
    .bdrv_is_allocated        = qcow_is_allocated,
2733
    .bdrv_set_key        = qcow_set_key,
2734
    .bdrv_make_empty        = qcow_make_empty,
2735

    
2736
    .bdrv_aio_read        = qcow_aio_read,
2737
    .bdrv_aio_write        = qcow_aio_write,
2738
    .bdrv_aio_cancel        = qcow_aio_cancel,
2739
    .aiocb_size                = sizeof(QCowAIOCB),
2740
    .bdrv_write_compressed = qcow_write_compressed,
2741

    
2742
    .bdrv_snapshot_create = qcow_snapshot_create,
2743
    .bdrv_snapshot_goto        = qcow_snapshot_goto,
2744
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2745
    .bdrv_snapshot_list        = qcow_snapshot_list,
2746
    .bdrv_get_info        = qcow_get_info,
2747

    
2748
    .bdrv_create2 = qcow_create2,
2749
};