Statistics
| Branch: | Revision:

root / block-qcow2.c @ f965509c

History | View | Annotate | Download (83.7 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
#include "block_int.h"
#include <assert.h>
#include <inttypes.h>
#include <zlib.h>
#include "aes.h"
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots.
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables always have a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
//#define DEBUG_ALLOC2
//#define DEBUG_EXT

/* Image magic: the bytes "QFI\xfb", stored big-endian at offset 0. */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 2

/* Values of QCowHeader.crypt_method. */
#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

/* Upper bound on the number of clusters handled per encryption request
   (sizes the cluster_data bounce buffer). */
#define QCOW_MAX_CRYPT_CLUSTERS 32

/* indicate that the refcount of the referenced cluster is exactly one. */
#define QCOW_OFLAG_COPIED     (1LL << 63)
/* indicate that the cluster is compressed (they never have the copied flag) */
#define QCOW_OFLAG_COMPRESSED (1LL << 62)

#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
65
/* On-disk image header; all multi-byte fields are stored big-endian
   and byte-swapped on load in qcow_open(). */
typedef struct QCowHeader {
    uint32_t magic;
    uint32_t version;
    uint64_t backing_file_offset;   /* 0 if there is no backing file */
    uint32_t backing_file_size;     /* length of the backing file name */
    uint32_t cluster_bits;          /* log2(cluster size), 9..16 */
    uint64_t size; /* in bytes */
    uint32_t crypt_method;          /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
    uint64_t l1_table_offset;
    uint64_t refcount_table_offset;
    uint32_t refcount_table_clusters;
    uint32_t nb_snapshots;
    uint64_t snapshots_offset;
} QCowHeader;
80

    
81

    
82
/* On-disk header-extension record: a (magic, len) pair followed by
   'len' payload bytes; records are padded to an 8-byte boundary and
   the list is terminated by magic == QCOW_EXT_MAGIC_END. */
typedef struct {
    uint32_t magic;
    uint32_t len;
} QCowExtension;
#define  QCOW_EXT_MAGIC_END 0
#define  QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
88

    
89

    
90
/* On-disk snapshot table entry (big-endian fields).  The fixed part is
   followed by variable-length extra data, the id string and the name. */
typedef struct __attribute__((packed)) QCowSnapshotHeader {
    /* header is 8 byte aligned */
    uint64_t l1_table_offset;

    uint32_t l1_size;
    uint16_t id_str_size;
    uint16_t name_size;

    uint32_t date_sec;
    uint32_t date_nsec;

    uint64_t vm_clock_nsec;

    uint32_t vm_state_size;
    uint32_t extra_data_size; /* for extension */
    /* extra data follows */
    /* id_str follows */
    /* name follows  */
} QCowSnapshotHeader;
109

    
110
/* Number of L2 tables kept in the in-memory cache. */
#define L2_CACHE_SIZE 16

/* In-memory representation of one snapshot table entry; id_str and
   name are heap-allocated strings. */
typedef struct QCowSnapshot {
    uint64_t l1_table_offset;
    uint32_t l1_size;
    char *id_str;
    char *name;
    uint32_t vm_state_size;
    uint32_t date_sec;
    uint32_t date_nsec;
    uint64_t vm_clock_nsec;
} QCowSnapshot;
122

    
123
/* Per-image driver state, filled in by qcow_open() from the header. */
typedef struct BDRVQcowState {
    BlockDriverState *hd;       /* underlying image file */
    int cluster_bits;           /* log2(cluster_size) */
    int cluster_size;           /* cluster size in bytes */
    int cluster_sectors;        /* cluster size in 512-byte sectors */
    int l2_bits;                /* log2(entries per L2 table) */
    int l2_size;                /* entries per L2 table (one cluster) */
    int l1_size;                /* entries in the active L1 table */
    int l1_vm_state_index;      /* first L1 index past the guest-visible size */
    int csize_shift;            /* shift of the sector count in compressed entries */
    int csize_mask;             /* mask of that sector count */
    uint64_t cluster_offset_mask;   /* offset part of a compressed entry */
    uint64_t l1_table_offset;   /* file offset of the active L1 table */
    uint64_t *l1_table;         /* active L1 table, host byte order */
    uint64_t *l2_cache;         /* L2_CACHE_SIZE cached L2 tables */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];   /* file offset per cache slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];    /* hit counters for eviction */
    uint8_t *cluster_cache;     /* cache for one decompressed cluster */
    uint8_t *cluster_data;      /* bounce buffer for encryption / COW copies */
    uint64_t cluster_cache_offset;  /* offset cached in cluster_cache, -1 if none */

    uint64_t *refcount_table;
    uint64_t refcount_table_offset;
    uint32_t refcount_table_size;
    uint64_t refcount_block_cache_offset;
    uint16_t *refcount_block_cache;
    /* allocation hints -- presumably advanced by alloc_clusters()/
       alloc_bytes() (defined later in this file); confirm there */
    int64_t free_cluster_index;
    int64_t free_byte_offset;

    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header;   /* crypt method read from the header */
    AES_KEY aes_encrypt_key;
    AES_KEY aes_decrypt_key;
    uint64_t snapshots_offset;  /* file offset of the snapshot table */
    int snapshots_size;
    int nb_snapshots;           /* entries in 'snapshots' */
    QCowSnapshot *snapshots;
} BDRVQcowState;
161

    
162
/* Forward declarations; the refcount and snapshot helpers are defined
   later in this file. */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors);
static int qcow_read_snapshots(BlockDriverState *bs);
static void qcow_free_snapshots(BlockDriverState *bs);
static int refcount_init(BlockDriverState *bs);
static void refcount_close(BlockDriverState *bs);
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
static int update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
                                   int addend);
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend);
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
static int64_t alloc_bytes(BlockDriverState *bs, int size);
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size);
#ifdef DEBUG_ALLOC
static void check_refcounts(BlockDriverState *bs);
#endif
183

    
184
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
185
{
186
    const QCowHeader *cow_header = (const void *)buf;
187

    
188
    if (buf_size >= sizeof(QCowHeader) &&
189
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
190
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
191
        return 100;
192
    else
193
        return 0;
194
}
195

    
196

    
197
/* 
198
 * read qcow2 extension and fill bs
199
 * start reading from start_offset
200
 * finish reading upon magic of value 0 or when end_offset reached
201
 * unknown magic is skipped (future extension this version knows nothing about)
202
 * return 0 upon success, non-0 otherwise
203
 */
204
static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
205
                                uint64_t end_offset)
206
{
207
    BDRVQcowState *s = bs->opaque;
208
    QCowExtension ext;
209
    uint64_t offset;
210

    
211
#ifdef DEBUG_EXT
212
    printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
213
#endif
214
    offset = start_offset;
215
    while (offset < end_offset) {
216

    
217
#ifdef DEBUG_EXT
218
        /* Sanity check */
219
        if (offset > s->cluster_size)
220
            printf("qcow_handle_extension: suspicious offset %lu\n", offset);
221

    
222
        printf("attemting to read extended header in offset %lu\n", offset);
223
#endif
224

    
225
        if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
226
            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %lu\n",
227
                    offset);
228
            return 1;
229
        }
230
        be32_to_cpus(&ext.magic);
231
        be32_to_cpus(&ext.len);
232
        offset += sizeof(ext);
233
#ifdef DEBUG_EXT
234
        printf("ext.magic = 0x%x\n", ext.magic);
235
#endif
236
        switch (ext.magic) {
237
        case QCOW_EXT_MAGIC_END:
238
            return 0;
239

    
240
        case QCOW_EXT_MAGIC_BACKING_FORMAT:
241
            if (ext.len >= sizeof(bs->backing_format)) {
242
                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
243
                        " (>=%lu)\n",
244
                        ext.len, sizeof(bs->backing_format));
245
                return 2;
246
            }
247
            if (bdrv_pread(s->hd, offset , bs->backing_format,
248
                           ext.len) != ext.len)
249
                return 3;
250
            bs->backing_format[ext.len] = '\0';
251
#ifdef DEBUG_EXT
252
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
253
#endif
254
            offset += ((ext.len + 7) & ~7);
255
            break;
256

    
257
        default:
258
            /* unknown magic -- just skip it */
259
            offset += ((ext.len + 7) & ~7);
260
            break;
261
        }
262
    }
263

    
264
    return 0;
265
}
266

    
267

    
268
/* Open a qcow2 image: read and validate the header, derive the cluster
 * geometry, load the L1 table, set up the L2/refcount caches, then read
 * header extensions, the backing file name and the snapshot table.
 * Returns 0 on success, a negative value on error. */
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift, ret;
    QCowHeader header;
    uint64_t ext_end;

    /* Performance is terrible right now with cache=writethrough due mainly
     * to reference count updates.  If the user does not explicitly specify
     * a caching type, force to writeback caching.
     */
    if ((flags & BDRV_O_CACHE_DEF)) {
        flags |= BDRV_O_CACHE_WB;
        flags &= ~BDRV_O_CACHE_DEF;
    }
    ret = bdrv_file_open(&s->hd, filename, flags);
    if (ret < 0)
        return ret;
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
        goto fail;
    /* all header fields are stored big-endian on disk */
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    /* validate: right format, sane size, cluster size 512B..64KB,
       known encryption method */
    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
        goto fail;
    if (header.size <= 1 ||
        header.cluster_bits < 9 ||
        header.cluster_bits > 16)
        goto fail;
    if (header.crypt_method > QCOW_CRYPT_AES)
        goto fail;
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header)
        bs->encrypted = 1;
    /* derive the cluster geometry and the layout of compressed-cluster
       L2 entries (sector count next to the offset bits) */
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
    s->l1_size = header.l1_size;
    shift = s->cluster_bits + s->l2_bits;
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index)
        goto fail;
    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
        s->l1_size * sizeof(uint64_t))
        goto fail;
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    s->cluster_cache = qemu_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                                  + 512);
    s->cluster_cache_offset = -1;

    if (refcount_init(bs) < 0)
        goto fail;

    /* read qcow2 extensions: they live between the header and the
       backing file name (or the end of the first cluster) */
    if (header.backing_file_offset)
        ext_end = header.backing_file_offset;
    else
        ext_end = s->cluster_size;
    if (qcow_read_extensions(bs, sizeof(header), ext_end))
        goto fail;

    /* read the backing file name (truncated to fit bs->backing_file) */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023)
            len = 1023;
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
            goto fail;
        bs->backing_file[len] = '\0';
    }
    if (qcow_read_snapshots(bs) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;

 fail:
    /* tear down everything allocated so far; qemu_free(NULL) is a no-op */
    qcow_free_snapshots(bs);
    refcount_close(bs);
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    bdrv_delete(s->hd);
    return -1;
}
391

    
392
/* Install the encryption key: build a 128-bit AES key from the
   passphrase (truncated to 16 bytes, zero-padded) and derive the
   encrypt/decrypt schedules.  Returns 0 on success, -1 on failure. */
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len;

    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    memset(keybuf, 0, 16);
    len = strlen(key);
    if (len > 16)
        len = 16;
    memcpy(keybuf, key, len);

    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
        return -1;
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;
#if 0
    /* test */
    {
        int i;
        uint8_t in[16];
        uint8_t out[16];
        uint8_t tmp[16];
        for(i = 0; i < 16; i++)
            in[i] = i;
        AES_encrypt(in, tmp, &s->aes_encrypt_key);
        AES_decrypt(tmp, out, &s->aes_decrypt_key);
        for(i = 0; i < 16; i++)
            printf(" %02x", tmp[i]);
        printf("\n");
        for(i = 0; i < 16; i++)
            printf(" %02x", out[i]);
        printf("\n");
    }
#endif
    return 0;
}
433

    
434
/* The crypt function is compatible with the linux cryptoloop
435
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
436
   supported */
437
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
438
                            uint8_t *out_buf, const uint8_t *in_buf,
439
                            int nb_sectors, int enc,
440
                            const AES_KEY *key)
441
{
442
    union {
443
        uint64_t ll[2];
444
        uint8_t b[16];
445
    } ivec;
446
    int i;
447

    
448
    for(i = 0; i < nb_sectors; i++) {
449
        ivec.ll[0] = cpu_to_le64(sector_num);
450
        ivec.ll[1] = 0;
451
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
452
                        ivec.b, enc);
453
        sector_num++;
454
        in_buf += 512;
455
        out_buf += 512;
456
    }
457
}
458

    
459
/* Copy guest sectors [n_start, n_end) of the cluster beginning at guest
 * sector start_sect into the cluster at file offset cluster_offset,
 * re-encrypting if the image is encrypted (copy-on-write helper).
 * Returns 0 on success, a negative value on read/write error. */
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                        uint64_t cluster_offset, int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int n, ret;

    n = n_end - n_start;
    if (n <= 0)
        return 0;
    /* go through qcow_read() so backing file, compression and
       decryption are all handled transparently */
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
    if (ret < 0)
        return ret;
    if (s->crypt_method) {
        /* re-encrypt in place before writing to the new cluster */
        encrypt_sectors(s, start_sect + n_start,
                        s->cluster_data,
                        s->cluster_data, n, 1,
                        &s->aes_encrypt_key);
    }
    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
                     s->cluster_data, n);
    if (ret < 0)
        return ret;
    return 0;
}
483

    
484
/* Invalidate the whole L2 cache: clear the usage counters, the backing
   file offsets and the cached table data. */
static void l2_cache_reset(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    memset(s->l2_cache_counts, 0, sizeof(s->l2_cache_counts));
    memset(s->l2_cache_offsets, 0, sizeof(s->l2_cache_offsets));
    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
}
492

    
493
/* Pick the cache slot to evict: the one with the smallest hit counter
   (ties resolved in favor of the lowest index, as before). */
static inline int l2_cache_new_entry(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int i, best = 0;

    for (i = 1; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < s->l2_cache_counts[best])
            best = i;
    }
    return best;
}
510

    
511
/* Round offset up to the next multiple of n (n must be a power of two). */
static int64_t align_offset(int64_t offset, int n)
{
    int64_t mask = (int64_t)n - 1;

    return (offset + mask) & ~mask;
}
516

    
517
/* Grow the L1 table to at least min_size entries: write an enlarged
 * copy at a newly allocated cluster-aligned position, update the image
 * header (l1_size + l1_table_offset in one 12-byte write), then switch
 * the in-memory state over and free the old table's clusters.
 * Returns 0 on success, -EIO on write error.  On failure the current
 * s->l1_table is left untouched and remains valid. */
static int grow_l1_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size, new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    uint64_t new_l1_table_offset;
    uint8_t data[12];

    new_l1_size = s->l1_size;
    if (min_size <= new_l1_size)
        return 0;
    /* grow by at least 50% per round to amortize future growth */
    while (min_size > new_l1_size) {
        new_l1_size = (new_l1_size * 3 + 1) / 2;
    }
#ifdef DEBUG_ALLOC2
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_mallocz(new_l1_size2);
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);

    /* the on-disk L1 table is big-endian; swap, write, swap back */
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
    if (ret != new_l1_size2)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
                sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->l1_table);
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
 fail:
    /* BUG FIX: the original freed s->l1_table here, leaving the live L1
       table dangling (it is still referenced by s and freed again on
       close) and leaking new_l1_table.  Free the not-yet-installed new
       table instead. */
    qemu_free(new_l1_table);
    return -EIO;
}
566

    
567
/*
568
 * seek_l2_table
569
 *
570
 * seek l2_offset in the l2_cache table
571
 * if not found, return NULL,
572
 * if found,
573
 *   increments the l2 cache hit count of the entry,
574
 *   if counter overflow, divide by two all counters
575
 *   return the pointer to the l2 cache entry
576
 *
577
 */
578

    
579
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
580
{
581
    int i, j;
582

    
583
    for(i = 0; i < L2_CACHE_SIZE; i++) {
584
        if (l2_offset == s->l2_cache_offsets[i]) {
585
            /* increment the hit count */
586
            if (++s->l2_cache_counts[i] == 0xffffffff) {
587
                for(j = 0; j < L2_CACHE_SIZE; j++) {
588
                    s->l2_cache_counts[j] >>= 1;
589
                }
590
            }
591
            return s->l2_cache + (i << s->l2_bits);
592
        }
593
    }
594
    return NULL;
595
}
596

    
597
/*
598
 * l2_load
599
 *
600
 * Loads a L2 table into memory. If the table is in the cache, the cache
601
 * is used; otherwise the L2 table is loaded from the image file.
602
 *
603
 * Returns a pointer to the L2 table on success, or NULL if the read from
604
 * the image file failed.
605
 */
606

    
607
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
608
{
609
    BDRVQcowState *s = bs->opaque;
610
    int min_index;
611
    uint64_t *l2_table;
612

    
613
    /* seek if the table for the given offset is in the cache */
614

    
615
    l2_table = seek_l2_table(s, l2_offset);
616
    if (l2_table != NULL)
617
        return l2_table;
618

    
619
    /* not found: load a new entry in the least used one */
620

    
621
    min_index = l2_cache_new_entry(bs);
622
    l2_table = s->l2_cache + (min_index << s->l2_bits);
623
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
624
        s->l2_size * sizeof(uint64_t))
625
        return NULL;
626
    s->l2_cache_offsets[min_index] = l2_offset;
627
    s->l2_cache_counts[min_index] = 1;
628

    
629
    return l2_table;
630
}
631

    
632
/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 * Returns a pointer to the (cached) new L2 table, or NULL on I/O error.
 */

static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
{
    BDRVQcowState *s = bs->opaque;
    int min_index;
    uint64_t old_l2_offset, tmp;
    uint64_t *l2_table, l2_offset;

    old_l2_offset = s->l1_table[l1_index];

    /* allocate a new l2 entry */

    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));

    /* update the L1 entry */

    /* NOTE(review): the in-memory L1 entry is updated before the on-disk
       writes below succeed and is not rolled back on failure -- confirm
       callers tolerate this. */
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;

    /* write the updated L1 entry (big-endian) to disk */
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
                    &tmp, sizeof(tmp)) != sizeof(tmp))
        return NULL;

    /* allocate a new entry in the l2 cache */

    min_index = l2_cache_new_entry(bs);
    l2_table = s->l2_cache + (min_index << s->l2_bits);

    if (old_l2_offset == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        /* if there was an old l2 table, read it from the disk */
        if (bdrv_pread(s->hd, old_l2_offset,
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return NULL;
    }
    /* write the l2 table to the file */
    if (bdrv_pwrite(s->hd, l2_offset,
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
        s->l2_size * sizeof(uint64_t))
        return NULL;

    /* update the l2 cache entry */

    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;

    return l2_table;
}
692

    
693
/* Number of clusters needed to cover 'size' bytes, rounded up. */
static int size_to_clusters(BDRVQcowState *s, int64_t size)
{
    int64_t rounded = size + (s->cluster_size - 1);

    return rounded >> s->cluster_bits;
}
697

    
698
/* Count how many L2 entries, starting at index 'start', describe
   clusters that are physically contiguous (offsets advancing by
   cluster_size), ignoring the bits in 'mask'.  A zero first entry
   counts as no contiguous clusters at all. */
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t start, uint64_t mask)
{
    uint64_t base = be64_to_cpu(l2_table[0]) & ~mask;
    int i;

    if (!base)
        return 0;

    for (i = start; i < start + nb_clusters; i++) {
        if ((be64_to_cpu(l2_table[i]) & ~mask) != base + i * cluster_size)
            break;
    }

    return i - start;
}
713

    
714
/* Count leading unallocated (all-zero) L2 entries, up to nb_clusters. */
static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
{
    int n = 0;

    while (nb_clusters != 0 && l2_table[n] == 0) {
        nb_clusters--;
        n++;
    }

    return n;
}
723

    
724
/*
 * get_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * on entry, *num is the number of contiguous clusters we'd like to
 * access following offset.
 *
 * on exit, *num is the number of contiguous clusters we can read.
 *
 * Return 1, if the offset is found
 * Return 0, otherwise.
 *
 */

static uint64_t get_cluster_offset(BlockDriverState *bs,
                                   uint64_t offset, int *num)
{
    BDRVQcowState *s = bs->opaque;
    int l1_index, l2_index;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int l1_bits, c;
    int index_in_cluster, nb_available, nb_needed, nb_clusters;

    /* NOTE(review): despite the comment above, *num appears to be a
       sector count, not a cluster count -- confirm against callers. */
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the offset and
     * the end of the l1 entry
     */

    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));

    /* compute the number of available sectors */

    nb_available = (nb_available >> 9) + index_in_cluster;

    cluster_offset = 0;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size)
        goto out;

    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */

    if (!l2_offset)
        goto out;

    /* load the l2 table in memory */

    l2_offset &= ~QCOW_OFLAG_COPIED;
    l2_table = l2_load(bs, l2_offset);
    if (l2_table == NULL)
        return 0;

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    nb_clusters = size_to_clusters(s, nb_needed << 9);

    if (!cluster_offset) {
        /* how many empty clusters ? */
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
    } else {
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
    }

    nb_available = (c * s->cluster_sectors);
out:
    /* never report more than the caller asked for */
    if (nb_available > nb_needed)
        nb_available = nb_needed;

    *num = nb_available - index_in_cluster;

    return cluster_offset & ~QCOW_OFLAG_COPIED;
}
810

    
811
/*
812
 * free_any_clusters
813
 *
814
 * free clusters according to its type: compressed or not
815
 *
816
 */
817

    
818
static void free_any_clusters(BlockDriverState *bs,
819
                              uint64_t cluster_offset, int nb_clusters)
820
{
821
    BDRVQcowState *s = bs->opaque;
822

    
823
    /* free the cluster */
824

    
825
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
826
        int nb_csectors;
827
        nb_csectors = ((cluster_offset >> s->csize_shift) &
828
                       s->csize_mask) + 1;
829
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
830
                      nb_csectors * 512);
831
        return;
832
    }
833

    
834
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
835

    
836
    return;
837
}
838

    
839
/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the l2 table.
 *
 * the l2 table offset in the qcow2 file and the cluster index
 * in the l2 table are given to the caller.
 *
 * Returns 1 on success, 0 on failure (grow/load/allocate error).
 */

static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             uint64_t *new_l2_offset,
                             int *new_l2_index)
{
    BDRVQcowState *s = bs->opaque;
    int l1_index, l2_index, ret;
    uint64_t l2_offset, *l2_table;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        /* offset is beyond the current L1 table: grow it first */
        ret = grow_l1_table(bs, l1_index + 1);
        if (ret < 0)
            return 0;
    }
    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */

    if (l2_offset & QCOW_OFLAG_COPIED) {
        /* refcount is exactly one (see QCOW_OFLAG_COPIED): the table
           is writable in place */
        /* load the l2 table in memory */
        l2_offset &= ~QCOW_OFLAG_COPIED;
        l2_table = l2_load(bs, l2_offset);
        if (l2_table == NULL)
            return 0;
    } else {
        /* table absent or shared: copy-on-write a fresh L2 table */
        if (l2_offset)
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
        l2_table = l2_allocate(bs, l1_index);
        if (l2_table == NULL)
            return 0;
        /* l2_allocate() updated the L1 entry; re-read the new offset */
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_offset = l2_offset;
    *new_l2_index = l2_index;

    return 1;
}
896

    
897
/*
898
 * alloc_compressed_cluster_offset
899
 *
900
 * For a given offset of the disk image, return cluster offset in
901
 * qcow2 file.
902
 *
903
 * If the offset is not found, allocate a new compressed cluster.
904
 *
905
 * Return the cluster offset if successful,
906
 * Return 0, otherwise.
907
 *
908
 */
909

    
910
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
                                                uint64_t offset,
                                                int compressed_size)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table, l2_offset, coffset;
    int l2_index, nb_csectors;

    if (!get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index))
        return 0;

    coffset = be64_to_cpu(l2_table[l2_index]);

    /* a cluster that is private to this image is kept as-is */
    if (coffset & QCOW_OFLAG_COPIED)
        return coffset & ~QCOW_OFLAG_COPIED;

    /* drop whatever mapping was there before */
    if (coffset)
        free_any_clusters(bs, coffset, 1);

    /* allocate space for the compressed data and encode the compressed
       sector count into the entry */
    coffset = alloc_bytes(bs, compressed_size);
    nb_csectors = ((coffset + compressed_size - 1) >> 9) - (coffset >> 9);

    coffset |= QCOW_OFLAG_COMPRESSED |
               ((uint64_t)nb_csectors << s->csize_shift);

    /* update the L2 entry on disk; compressed clusters never carry the
       copied flag */
    l2_table[l2_index] = cpu_to_be64(coffset);
    if (bdrv_pwrite(s->hd,
                    l2_offset + l2_index * sizeof(uint64_t),
                    l2_table + l2_index,
                    sizeof(uint64_t)) != sizeof(uint64_t))
        return 0;

    return coffset;
}
950

    
951
/* Bookkeeping for a pending cluster allocation, carried from
   alloc_cluster_offset() to alloc_cluster_link_l2(). */
typedef struct QCowL2Meta
{
    uint64_t offset;      /* guest offset of the write */
    int n_start;          /* first sector actually written in the first cluster */
    int nb_available;     /* sectors covered by this allocation */
    int nb_clusters;      /* clusters allocated (0 = nothing to link) */
} QCowL2Meta;
958

    
959
/*
 * Link the freshly written clusters described by 'm' into the L2 table.
 * First copies any head/tail sectors the caller did not overwrite
 * (copy-on-write), then writes the new L2 entries to disk, and only
 * after that frees the clusters the entries used to point at.
 * Returns 0 on success, a negative errno on failure.
 */
static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
        QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;

    if (m->nb_clusters == 0)
        return 0;

    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));

    /* copy content of unmodified sectors */
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
    if (m->n_start) {
        /* head of the first cluster was not written by the caller */
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
        if (ret < 0)
            goto err;
    }

    if (m->nb_available & (s->cluster_sectors - 1)) {
        /* tail of the last cluster was not written either */
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
                m->nb_available - end, s->cluster_sectors);
        if (ret < 0)
            goto err;
    }

    ret = -EIO;
    /* update L2 table */
    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
        goto err;

    for (i = 0; i < m->nb_clusters; i++) {
        /* remember the old mapping so it can be freed once the new
           entries are safely on disk */
        if(l2_table[l2_index + i] != 0)
            old_cluster[j++] = l2_table[l2_index + i];

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
     }

    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
            m->nb_clusters * sizeof(uint64_t))
        goto err;

    /* new entries are on disk: the old clusters may now be released */
    for (i = 0; i < j; i++)
        free_any_clusters(bs, old_cluster[i], 1);

    ret = 0;
err:
    qemu_free(old_cluster);
    return ret;
 }
1013

    
1014
/*
 * alloc_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * If the offset is not found, allocate a new cluster.
 *
 * Return the cluster offset if successful,
 * Return 0, otherwise.
 *
 */
1026

    
1027
/*
 * Map a guest offset to a host cluster offset for writing, allocating
 * new clusters when needed.  On return, *num is the number of sectors
 * covered, and *m describes any allocation that still has to be linked
 * into the L2 table by alloc_cluster_link_l2() once the data write
 * succeeded.  Returns the cluster offset, or 0 on failure.
 */
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
                                     uint64_t offset,
                                     int n_start, int n_end,
                                     int *num, QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int nb_clusters, i = 0;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
    if (ret == 0)
        return 0;

    nb_clusters = size_to_clusters(s, n_end << 9);

    /* never look past the end of this L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* We keep all QCOW_OFLAG_COPIED clusters */

    if (cluster_offset & QCOW_OFLAG_COPIED) {
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, 0);

        cluster_offset &= ~QCOW_OFLAG_COPIED;
        m->nb_clusters = 0;

        goto out;
    }

    /* for the moment, multiple compressed clusters are not managed */

    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
        nb_clusters = 1;

    /* how many available clusters ? */

    while (i < nb_clusters) {
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
                &l2_table[l2_index], i, 0);

        /* BUG FIX: l2_index + nb_clusters may equal s->l2_size, so the
           entry reads below must not be reached once i == nb_clusters —
           the old code read one uint64_t past the end of the table. */
        if (i >= nb_clusters)
            break;

        if(be64_to_cpu(l2_table[l2_index + i]))
            break;

        i += count_contiguous_free_clusters(nb_clusters - i,
                &l2_table[l2_index + i]);

        if (i >= nb_clusters)
            break;

        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);

        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
            break;
    }
    nb_clusters = i;

    /* allocate a new cluster */

    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);

    /* save info needed for meta data update */
    m->offset = offset;
    m->n_start = n_start;
    m->nb_clusters = nb_clusters;

out:
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);

    *num = m->nb_available - n_start;

    return cluster_offset;
}
1100

    
1101
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    uint64_t off;

    /* get_cluster_offset() clamps *pnum down to the length of the
       contiguous run starting at sector_num */
    *pnum = nb_sectors;
    off = get_cluster_offset(bs, sector_num << 9, pnum);

    return off != 0;
}
1111

    
1112
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1113
                             const uint8_t *buf, int buf_size)
1114
{
1115
    z_stream strm1, *strm = &strm1;
1116
    int ret, out_len;
1117

    
1118
    memset(strm, 0, sizeof(*strm));
1119

    
1120
    strm->next_in = (uint8_t *)buf;
1121
    strm->avail_in = buf_size;
1122
    strm->next_out = out_buf;
1123
    strm->avail_out = out_buf_size;
1124

    
1125
    ret = inflateInit2(strm, -12);
1126
    if (ret != Z_OK)
1127
        return -1;
1128
    ret = inflate(strm, Z_FINISH);
1129
    out_len = strm->next_out - out_buf;
1130
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1131
        out_len != out_buf_size) {
1132
        inflateEnd(strm);
1133
        return -1;
1134
    }
1135
    inflateEnd(strm);
1136
    return 0;
1137
}
1138

    
1139
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
{
    uint64_t coffset = cluster_offset & s->cluster_offset_mask;
    int nb_csectors, sector_offset, csize;

    /* cache hit: this cluster is already decompressed */
    if (s->cluster_cache_offset == coffset)
        return 0;

    /* the compressed sector count is encoded in the upper entry bits */
    nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
    sector_offset = coffset & 511;
    csize = nb_csectors * 512 - sector_offset;

    if (bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors) < 0)
        return -1;

    if (decompress_buffer(s->cluster_cache, s->cluster_size,
                          s->cluster_data + sector_offset, csize) < 0)
        return -1;

    s->cluster_cache_offset = coffset;
    return 0;
}
1161

    
1162
/* handle reading after the end of the backing file */
1163
static int backing_read1(BlockDriverState *bs,
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
{
    int valid;

    /* whole request lies inside the backing file: nothing to patch */
    if (sector_num + nb_sectors <= bs->total_sectors)
        return nb_sectors;

    /* sectors past the end of the backing file read as zeroes */
    valid = (sector_num >= bs->total_sectors)
        ? 0
        : (int)(bs->total_sectors - sector_num);
    memset(buf + valid * 512, 0, 512 * (nb_sectors - valid));

    return valid;
}
1176

    
1177
/*
 * Synchronous read of nb_sectors starting at sector_num.  Each loop
 * iteration handles one contiguous run: unallocated runs come from the
 * backing file (or are zero-filled), compressed clusters go through the
 * decompression cache, everything else is read directly (with optional
 * in-place AES decryption).  Returns 0 on success, -1 on error.
 */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n, n1;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        n = nb_sectors;
        /* get_cluster_offset() clamps n to the contiguous run length */
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
                if (n1 > 0) {
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
                    if (ret < 0)
                        return -1;
                }
            } else {
                /* no backing file: unallocated sectors read as zeroes */
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                /* decrypt in place */
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
1219

    
1220
/*
 * Synchronous write of nb_sectors starting at sector_num.  Clusters are
 * allocated copy-on-write via alloc_cluster_offset(); the new mapping
 * is committed with alloc_cluster_link_l2() only after the data write
 * succeeded, and rolled back otherwise.  Returns 0 on success, -1 on
 * error.
 */
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
                     const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n;
    uint64_t cluster_offset;
    int n_end;
    QCowL2Meta l2meta;

    while (nb_sectors > 0) {
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n_end = index_in_cluster + nb_sectors;
        /* the encryption bounce buffer bounds one iteration's size */
        if (s->crypt_method &&
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
                                              index_in_cluster,
                                              n_end, &n, &l2meta);
        if (!cluster_offset)
            return -1;
        if (s->crypt_method) {
            /* encrypt into the bounce buffer, then write that */
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
                            &s->aes_encrypt_key);
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
                              s->cluster_data, n * 512);
        } else {
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
        }
        if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
            /* roll back the allocation on failure */
            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
            return -1;
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    s->cluster_cache_offset = -1; /* disable compressed cache */
    return 0;
}
1259

    
1260
/* State of one in-flight asynchronous read or write request. */
typedef struct QCowAIOCB {
    BlockDriverAIOCB common;     /* generic AIOCB header (must be first) */
    int64_t sector_num;          /* next sector to process */
    uint8_t *buf;                /* current position in the caller's buffer */
    int nb_sectors;              /* sectors still to process */
    int n;                       /* sectors covered by the current step */
    uint64_t cluster_offset;     /* mapping used by the current step */
    uint8_t *cluster_data;       /* lazily allocated encryption bounce buffer */
    BlockDriverAIOCB *hd_aiocb;  /* in-flight request on the underlying file */
    QEMUBH *bh;                  /* bottom half used to re-enter the callback */
    QCowL2Meta l2meta;           /* pending cluster allocation (writes) */
} QCowAIOCB;
1272

    
1273
static void qcow_aio_read_cb(void *opaque, int ret);

/* Bottom half: re-enter the read state machine with a success status,
   outside the stack frame that scheduled it. */
static void qcow_aio_read_bh(void *opaque)
{
    QCowAIOCB *acb = opaque;
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qcow_aio_read_cb(opaque, 0);
}
1281

    
1282
/* Arm a one-shot bottom half that will run cb(acb).  Fails with -EIO if
   one is already pending or cannot be created. */
static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
{
    QEMUBH *bh;

    if (acb->bh != NULL)
        return -EIO;

    bh = qemu_bh_new(cb, acb);
    if (bh == NULL)
        return -EIO;

    acb->bh = bh;
    qemu_bh_schedule(bh);

    return 0;
}
1295

    
1296
/*
 * Read state machine.  Invoked once to start (ret == 0) and then as the
 * completion callback of each underlying request — or via a bottom half
 * when a step finished synchronously.  Post-processes the sectors read
 * by the previous step, then issues the next one; releases the AIOCB
 * and reports through common.cb when nb_sectors reaches 0 or on error.
 */
static void qcow_aio_read_cb(void *opaque, int ret)
{
    QCowAIOCB *acb = opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n1;

    acb->hd_aiocb = NULL;
    if (ret < 0) {
fail:
        acb->common.cb(acb->common.opaque, ret);
        qemu_aio_release(acb);
        return;
    }

    /* post process the read buffer */
    if (!acb->cluster_offset) {
        /* nothing to do */
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* nothing to do */
    } else {
        if (s->crypt_method) {
            /* decrypt in place what the previous step read */
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
                            acb->n, 0,
                            &s->aes_decrypt_key);
        }
    }

    acb->nb_sectors -= acb->n;
    acb->sector_num += acb->n;
    acb->buf += acb->n * 512;

    if (acb->nb_sectors == 0) {
        /* request completed */
        acb->common.cb(acb->common.opaque, 0);
        qemu_aio_release(acb);
        return;
    }

    /* prepare next AIO request */
    acb->n = acb->nb_sectors;
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);

    if (!acb->cluster_offset) {
        if (bs->backing_hd) {
            /* read from the base image */
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
                               acb->buf, acb->n);
            if (n1 > 0) {
                acb->hd_aiocb = bdrv_aio_read(bs->backing_hd, acb->sector_num,
                                    acb->buf, acb->n, qcow_aio_read_cb, acb);
                if (acb->hd_aiocb == NULL)
                    goto fail;
            } else {
                /* past the backing file end: backing_read1() already
                   zeroed the buffer, just reschedule ourselves */
                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
                if (ret < 0)
                    goto fail;
            }
        } else {
            /* Note: in this case, no need to wait */
            memset(acb->buf, 0, 512 * acb->n);
            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
            if (ret < 0)
                goto fail;
        }
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* add AIO support for compressed blocks ? */
        if (decompress_cluster(s, acb->cluster_offset) < 0)
            goto fail;
        memcpy(acb->buf,
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
        if (ret < 0)
            goto fail;
    } else {
        /* cluster offsets must be sector aligned */
        if ((acb->cluster_offset & 511) != 0) {
            ret = -EIO;
            goto fail;
        }
        acb->hd_aiocb = bdrv_aio_read(s->hd,
                            (acb->cluster_offset >> 9) + index_in_cluster,
                            acb->buf, acb->n, qcow_aio_read_cb, acb);
        if (acb->hd_aiocb == NULL)
            goto fail;
    }
}
1383

    
1384
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1385
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1386
        BlockDriverCompletionFunc *cb, void *opaque)
1387
{
1388
    QCowAIOCB *acb;
1389

    
1390
    acb = qemu_aio_get(bs, cb, opaque);
1391
    if (!acb)
1392
        return NULL;
1393
    acb->hd_aiocb = NULL;
1394
    acb->sector_num = sector_num;
1395
    acb->buf = buf;
1396
    acb->nb_sectors = nb_sectors;
1397
    acb->n = 0;
1398
    acb->cluster_offset = 0;
1399
    acb->l2meta.nb_clusters = 0;
1400
    return acb;
1401
}
1402

    
1403
static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
1404
        int64_t sector_num, uint8_t *buf, int nb_sectors,
1405
        BlockDriverCompletionFunc *cb, void *opaque)
1406
{
1407
    QCowAIOCB *acb;
1408

    
1409
    acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
1410
    if (!acb)
1411
        return NULL;
1412

    
1413
    qcow_aio_read_cb(acb, 0);
1414
    return &acb->common;
1415
}
1416

    
1417
/*
 * Write state machine.  Entered once to start (ret == 0) and then as
 * the completion callback of each bdrv_aio_write().  Commits the L2
 * mapping of the step that just finished, then allocates and issues the
 * next one; releases the AIOCB and reports through common.cb when
 * nb_sectors reaches 0 or on error.
 */
static void qcow_aio_write_cb(void *opaque, int ret)
{
    QCowAIOCB *acb = opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    const uint8_t *src_buf;
    int n_end;

    acb->hd_aiocb = NULL;

    if (ret < 0) {
    fail:
        acb->common.cb(acb->common.opaque, ret);
        qemu_aio_release(acb);
        return;
    }

    /* commit the mapping of the completed step (no-op on first entry:
       l2meta.nb_clusters was initialised to 0 in qcow_aio_setup) */
    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
        goto fail;
    }

    acb->nb_sectors -= acb->n;
    acb->sector_num += acb->n;
    acb->buf += acb->n * 512;

    if (acb->nb_sectors == 0) {
        /* request completed */
        acb->common.cb(acb->common.opaque, 0);
        qemu_aio_release(acb);
        return;
    }

    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
    n_end = index_in_cluster + acb->nb_sectors;
    /* the encryption bounce buffer bounds one step's size */
    if (s->crypt_method &&
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;

    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
                                          index_in_cluster,
                                          n_end, &acb->n, &acb->l2meta);
    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
        ret = -EIO;
        goto fail;
    }
    if (s->crypt_method) {
        if (!acb->cluster_data) {
            /* allocated once per request, reused for every step */
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
                                             s->cluster_size);
        }
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
                        acb->n, 1, &s->aes_encrypt_key);
        src_buf = acb->cluster_data;
    } else {
        src_buf = acb->buf;
    }
    acb->hd_aiocb = bdrv_aio_write(s->hd,
                                   (acb->cluster_offset >> 9) + index_in_cluster,
                                   src_buf, acb->n,
                                   qcow_aio_write_cb, acb);
    if (acb->hd_aiocb == NULL)
        goto fail;
}
1482

    
1483
static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
1484
        int64_t sector_num, const uint8_t *buf, int nb_sectors,
1485
        BlockDriverCompletionFunc *cb, void *opaque)
1486
{
1487
    BDRVQcowState *s = bs->opaque;
1488
    QCowAIOCB *acb;
1489

    
1490
    s->cluster_cache_offset = -1; /* disable compressed cache */
1491

    
1492
    acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
1493
    if (!acb)
1494
        return NULL;
1495

    
1496
    qcow_aio_write_cb(acb, 0);
1497
    return &acb->common;
1498
}
1499

    
1500
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;

    /* cancel any in-flight request on the underlying file first */
    if (acb->hd_aiocb != NULL)
        bdrv_aio_cancel(acb->hd_aiocb);

    qemu_aio_release(acb);
}
1507

    
1508
/* Release all per-image state, then close the underlying file. */
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    /* refcount state references s->hd, so close it before bdrv_delete */
    refcount_close(bs);
    bdrv_delete(s->hd);
}
1518

    
1519
/* XXX: use std qcow open function ? */
1520
/* Transient state used only while creating a new image. */
typedef struct QCowCreateState {
    int cluster_size;                /* bytes per cluster */
    int cluster_bits;                /* log2(cluster_size) */
    uint16_t *refcount_block;        /* first refcount block (big-endian) */
    uint64_t *refcount_table;        /* refcount table (big-endian) */
    int64_t l1_table_offset;         /* file offsets of the metadata areas */
    int64_t refcount_table_offset;
    int64_t refcount_block_offset;
} QCowCreateState;
1529

    
1530
/* Bump the refcount of every cluster touched by [offset, offset+size)
   in the in-memory (big-endian) refcount block. */
static void create_refcount_update(QCowCreateState *s,
                                   int64_t offset, int64_t size)
{
    int64_t first, last, coff;
    uint16_t *entry;

    /* round the byte range out to cluster boundaries */
    first = offset & ~(s->cluster_size - 1);
    last = (offset + size - 1)  & ~(s->cluster_size - 1);

    for (coff = first; coff <= last; coff += s->cluster_size) {
        entry = &s->refcount_block[coff >> s->cluster_bits];
        *entry = cpu_to_be16(be16_to_cpu(*entry) + 1);
    }
}
1547

    
1548
/*
 * Create a new qcow2 image of total_size 512-byte sectors, with an
 * optional backing file and backing-format header extension.  Computes
 * the layout (header, L1 table, refcount table, one refcount block),
 * seeds the refcounts for the metadata itself, and writes everything
 * with plain POSIX I/O.  Returns 0 on success, -1 if the file cannot
 * be created.
 *
 * NOTE(review): write()/lseek() return values are ignored, so a short
 * write (e.g. disk full) produces a corrupt image undetected — worth
 * fixing.
 */
static int qcow_create2(const char *filename, int64_t total_size,
                        const char *backing_file, const char *backing_format,
                        int flags)
{

    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
    int backing_format_len = 0;
    QCowHeader header;
    uint64_t tmp, offset;
    QCowCreateState s1, *s = &s1;
    QCowExtension ext_bf = {0, 0};


    memset(s, 0, sizeof(*s));

    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
    if (fd < 0)
        return -1;
    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
    header.size = cpu_to_be64(total_size * 512);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
        if (backing_format) {
            /* the backing format is stored as a header extension,
               padded to a multiple of 8 bytes */
            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
            backing_format_len = strlen(backing_format);
            ext_bf.len = (backing_format_len + 7) & ~7;
            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
        }
        header.backing_file_offset = cpu_to_be64(header_size);
        backing_filename_len = strlen(backing_file);
        header.backing_file_size = cpu_to_be32(backing_filename_len);
        header_size += backing_filename_len;
    }
    s->cluster_bits = 12;  /* 4 KB clusters */
    s->cluster_size = 1 << s->cluster_bits;
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
    header_size = (header_size + 7) & ~7;
    if (flags & BLOCK_FLAG_ENCRYPT) {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    } else {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    }
    /* each L2 table fills one cluster of 8-byte entries */
    l2_bits = s->cluster_bits - 3;
    shift = s->cluster_bits + l2_bits;
    /* L1 entries needed to cover the whole virtual disk, rounded up */
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
    offset = align_offset(header_size, s->cluster_size);
    s->l1_table_offset = offset;
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
    header.l1_size = cpu_to_be32(l1_size);
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);

    /* one cluster for the refcount table, one for the first block */
    s->refcount_table = qemu_mallocz(s->cluster_size);
    s->refcount_block = qemu_mallocz(s->cluster_size);

    s->refcount_table_offset = offset;
    header.refcount_table_offset = cpu_to_be64(offset);
    header.refcount_table_clusters = cpu_to_be32(1);
    offset += s->cluster_size;

    s->refcount_table[0] = cpu_to_be64(offset);
    s->refcount_block_offset = offset;
    offset += s->cluster_size;

    /* update refcounts */
    create_refcount_update(s, 0, header_size);
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
    create_refcount_update(s, s->refcount_block_offset, s->cluster_size);

    /* write all the data */
    write(fd, &header, sizeof(header));
    if (backing_file) {
        if (backing_format_len) {
            char zero[16];
            int d = ext_bf.len - backing_format_len;

            memset(zero, 0, sizeof(zero));
            cpu_to_be32s(&ext_bf.magic);
            cpu_to_be32s(&ext_bf.len);
            write(fd, &ext_bf, sizeof(ext_bf));
            write(fd, backing_format, backing_format_len);
            if (d>0) {
                /* pad the extension to its declared 8-byte-aligned len */
                write(fd, zero, d);
            }
        }
        write(fd, backing_file, backing_filename_len);
    }
    lseek(fd, s->l1_table_offset, SEEK_SET);
    tmp = 0;
    /* the L1 table starts out empty */
    for(i = 0;i < l1_size; i++) {
        write(fd, &tmp, sizeof(tmp));
    }
    lseek(fd, s->refcount_table_offset, SEEK_SET);
    write(fd, s->refcount_table, s->cluster_size);

    lseek(fd, s->refcount_block_offset, SEEK_SET);
    write(fd, s->refcount_block, s->cluster_size);

    qemu_free(s->refcount_table);
    qemu_free(s->refcount_block);
    close(fd);
    return 0;
}
1654

    
1655
/* Create an image without an explicit backing-format extension. */
static int qcow_create(const char *filename, int64_t total_size,
                       const char *backing_file, int flags)
{
    return qcow_create2(filename, total_size, backing_file, NULL, flags);
}
1660

    
1661
/* Would reset the image to an empty state; the implementation is
   compiled out because it is incomplete (see the XXX inside). */
static int qcow_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1680

    
1681
/* XXX: put compressed sectors first, then all the cluster aligned
1682
   tables to avoid losing bytes in alignment */
1683
/*
 * Compress one cluster's worth of data and write it.  nb_sectors == 0
 * is a flush/finalize request that only pads the file to a sector
 * boundary.  Falls back to a normal write when the data does not
 * shrink.  Returns 0 on success, -EINVAL for a partial cluster, -1 on
 * error.
 */
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(s->hd);
        cluster_offset = (cluster_offset + 511) & ~511;
        bdrv_truncate(s->hd, cluster_offset);
        return 0;
    }

    /* only whole clusters can be compressed */
    if (nb_sectors != s->cluster_sectors)
        return -EINVAL;

    /* worst-case deflate expansion: input + 0.1% + 128 bytes */
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
        qemu_free(out_buf);
        return -1;
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        qemu_free(out_buf);
        deflateEnd(&strm);
        return -1;
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
    } else {
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
                                              out_len);
        if (!cluster_offset) {
            /* BUG FIX: out_buf used to leak on this error path */
            qemu_free(out_buf);
            return -1;
        }
        cluster_offset &= s->cluster_offset_mask;
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
            qemu_free(out_buf);
            return -1;
        }
    }

    qemu_free(out_buf);
    return 0;
}
1749

    
1750
/* Flush pending writes by delegating to the underlying file. */
static void qcow_flush(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    bdrv_flush(s->hd);
}
1755

    
1756
/* Fill in driver info: cluster size and where VM state is stored. */
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;
    bdi->cluster_size = s->cluster_size;
    /* byte offset covered by the first L1 entry reserved for VM state */
    bdi->vm_state_offset = (int64_t)s->l1_vm_state_index <<
        (s->cluster_bits + s->l2_bits);
    return 0;
}
1764

    
1765
/*********************************************************/
1766
/* snapshot support */
1767

    
1768
/* update the refcounts of snapshots and the copied flag */
/*
 * Walk the two-level table rooted at 'l1_table_offset' and apply 'addend'
 * (+1, -1, or 0) to the refcount of every data cluster and every L2 table
 * it references.  Independently of 'addend', the QCOW_OFLAG_COPIED bit of
 * each L1/L2 entry is rewritten so that it is set exactly when the
 * referenced cluster's refcount is 1 (i.e. it is safe to write in place).
 *
 * If 'l1_table_offset' is the active L1 table, the in-memory copy
 * (s->l1_table, kept in CPU byte order) is used directly; otherwise the
 * table is read from disk and byte-swapped locally.
 *
 * Returns 0 on success, -EIO on any read/write failure.
 */
static int update_snapshot_refcount(BlockDriverState *bs,
                                    int64_t l1_table_offset,
                                    int l1_size,
                                    int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
    int64_t old_offset, old_l2_offset;
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;

    /* The L2 cache may hold entries whose COPIED flags we are about to
       change on disk; drop it so stale entries are never served. */
    l2_cache_reset(bs);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    l1_allocated = 0;
    if (l1_table_offset != s->l1_table_offset) {
        /* Snapshot L1 table: load a private, byte-swapped copy. */
        l1_table = qemu_malloc(l1_size2);
        l1_allocated = 1;
        if (bdrv_pread(s->hd, l1_table_offset,
                       l1_table, l1_size2) != l1_size2)
            goto fail;
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        /* Active L1 table: operate on the in-memory copy. */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = 0;
    }

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    l1_modified = 0;
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            /* Strip the flag bit to get the real on-disk offset. */
            l2_offset &= ~QCOW_OFLAG_COPIED;
            l2_modified = 0;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    old_offset = offset;
                    offset &= ~QCOW_OFLAG_COPIED;
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* Compressed clusters are counted in 512-byte
                           sectors encoded in the entry itself. */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0)
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
                                            nb_csectors * 512, addend);
                        /* compressed clusters are never modified */
                        refcount = 2;
                    } else {
                        if (addend != 0) {
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
                        } else {
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
                        }
                    }

                    /* Recompute COPIED from the (possibly new) refcount. */
                    if (refcount == 1) {
                        offset |= QCOW_OFLAG_COPIED;
                    }
                    if (offset != old_offset) {
                        l2_table[j] = cpu_to_be64(offset);
                        l2_modified = 1;
                    }
                }
            }
            /* Write the L2 table back only if an entry changed. */
            if (l2_modified) {
                if (bdrv_pwrite(s->hd,
                                l2_offset, l2_table, l2_size) != l2_size)
                    goto fail;
            }

            /* Same treatment for the L2 table's own cluster. */
            if (addend != 0) {
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }
    if (l1_modified) {
        /* Byte-swap in place for the write, then swap back so the active
           in-memory table (when l1_allocated == 0) stays in CPU order. */
        for(i = 0; i < l1_size; i++)
            cpu_to_be64s(&l1_table[i]);
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
                        l1_size2) != l1_size2)
            goto fail;
        for(i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
1879

    
1880
/* Release every in-memory snapshot record and the array holding them,
   leaving the state ready for a fresh qcow_read_snapshots(). */
static void qcow_free_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;

    for (sn = s->snapshots; sn < s->snapshots + s->nb_snapshots; sn++) {
        qemu_free(sn->name);
        qemu_free(sn->id_str);
    }
    qemu_free(s->snapshots);
    s->snapshots = NULL;
    s->nb_snapshots = 0;
}
1893

    
1894
/*
 * Load the on-disk snapshot table into s->snapshots.
 *
 * Each entry is a fixed QCowSnapshotHeader followed by variable-length
 * extra data, the id string and the name, with each entry 8-byte aligned.
 * On success s->snapshots_size is set to the total on-disk byte span and
 * 0 is returned; on any read failure everything loaded so far is freed
 * and -1 is returned.
 *
 * NOTE(review): nb_snapshots, extra_data_size, id_str_size and name_size
 * come straight from the image file and are not range-checked here —
 * presumably trusted input; verify against callers if images can be
 * untrusted.
 */
static int qcow_read_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshotHeader h;
    QCowSnapshot *sn;
    int i, id_str_size, name_size;
    int64_t offset;
    uint32_t extra_data_size;

    /* No snapshots: leave an empty table. */
    if (!s->nb_snapshots) {
        s->snapshots = NULL;
        s->snapshots_size = 0;
        return 0;
    }

    offset = s->snapshots_offset;
    /* zeroed so qcow_free_snapshots() is safe on a partial load */
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
    for(i = 0; i < s->nb_snapshots; i++) {
        /* entries are 8-byte aligned on disk */
        offset = align_offset(offset, 8);
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        sn = s->snapshots + i;
        /* all header fields are stored big-endian */
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
        sn->l1_size = be32_to_cpu(h.l1_size);
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
        sn->date_sec = be32_to_cpu(h.date_sec);
        sn->date_nsec = be32_to_cpu(h.date_nsec);
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
        extra_data_size = be32_to_cpu(h.extra_data_size);

        id_str_size = be16_to_cpu(h.id_str_size);
        name_size = be16_to_cpu(h.name_size);

        /* skip extra data we do not interpret */
        offset += extra_data_size;

        sn->id_str = qemu_malloc(id_str_size + 1);
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        sn->id_str[id_str_size] = '\0';

        sn->name = qemu_malloc(name_size + 1);
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
        sn->name[name_size] = '\0';
    }
    s->snapshots_size = offset - s->snapshots_offset;
    return 0;
 fail:
    qcow_free_snapshots(bs);
    return -1;
}
1948

    
1949
/* add at the end of the file a new list of snapshots */
/*
 * Serialize s->snapshots to freshly allocated clusters, update the image
 * header to point at the new table, then free the old table's clusters.
 * Returns 0 on success, -1 on any write failure.
 *
 * NOTE(review): on the failure path the newly allocated clusters are not
 * freed and the header may have been partially updated — confirm whether
 * callers tolerate this.
 */
static int qcow_write_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    QCowSnapshotHeader h;
    int i, name_size, id_str_size, snapshots_size;
    uint64_t data64;
    uint32_t data32;
    int64_t offset, snapshots_offset;

    /* compute the size of the snapshots */
    offset = 0;
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        offset = align_offset(offset, 8);
        offset += sizeof(h);
        offset += strlen(sn->id_str);
        offset += strlen(sn->name);
    }
    snapshots_size = offset;

    /* allocate space for the new table before touching the old one */
    snapshots_offset = alloc_clusters(bs, snapshots_size);
    offset = snapshots_offset;

    /* second pass: write each header + id string + name, big-endian */
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        memset(&h, 0, sizeof(h));
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
        h.l1_size = cpu_to_be32(sn->l1_size);
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
        h.date_sec = cpu_to_be32(sn->date_sec);
        h.date_nsec = cpu_to_be32(sn->date_nsec);
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);

        id_str_size = strlen(sn->id_str);
        name_size = strlen(sn->name);
        h.id_str_size = cpu_to_be16(id_str_size);
        h.name_size = cpu_to_be16(name_size);
        offset = align_offset(offset, 8);
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
    }

    /* update the various header fields */
    data64 = cpu_to_be64(snapshots_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
                    &data64, sizeof(data64)) != sizeof(data64))
        goto fail;
    data32 = cpu_to_be32(s->nb_snapshots);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
                    &data32, sizeof(data32)) != sizeof(data32))
        goto fail;

    /* free the old snapshot table */
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
    s->snapshots_offset = snapshots_offset;
    s->snapshots_size = snapshots_size;
    return 0;
 fail:
    return -1;
}
2018

    
2019
/* Generate a fresh numeric snapshot id: one more than the largest
   existing id, formatted into 'id_str' (truncated to id_str_size). */
static void find_new_snapshot_id(BlockDriverState *bs,
                                 char *id_str, int id_str_size)
{
    BDRVQcowState *s = bs->opaque;
    int i, id_max = 0;

    for (i = 0; i < s->nb_snapshots; i++) {
        /* ids are stored as decimal strings; non-numeric ids parse as 0 */
        int cur = strtoul(s->snapshots[i].id_str, NULL, 10);
        if (cur > id_max)
            id_max = cur;
    }
    snprintf(id_str, id_str_size, "%d", id_max + 1);
}
2034

    
2035
/* Return the index of the snapshot whose id string equals 'id_str',
   or -1 if there is none. */
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
{
    BDRVQcowState *s = bs->opaque;
    int i;

    for (i = 0; i < s->nb_snapshots; i++) {
        if (strcmp(s->snapshots[i].id_str, id_str) == 0)
            return i;
    }
    return -1;
}
2046

    
2047
/* Look 'name' up first as a snapshot id, then as a snapshot name.
   Returns the snapshot index, or -1 if neither matches. */
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
{
    BDRVQcowState *s = bs->opaque;
    int i;

    /* id match takes precedence over name match */
    i = find_snapshot_by_id(bs, name);
    if (i >= 0)
        return i;
    for (i = 0; i < s->nb_snapshots; i++) {
        if (strcmp(s->snapshots[i].name, name) == 0)
            return i;
    }
    return -1;
}
2061

    
2062
/* if no id is provided, a new one is constructed */
2063
static int qcow_snapshot_create(BlockDriverState *bs,
2064
                                QEMUSnapshotInfo *sn_info)
2065
{
2066
    BDRVQcowState *s = bs->opaque;
2067
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
2068
    int i, ret;
2069
    uint64_t *l1_table = NULL;
2070

    
2071
    memset(sn, 0, sizeof(*sn));
2072

    
2073
    if (sn_info->id_str[0] == '\0') {
2074
        /* compute a new id */
2075
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
2076
    }
2077

    
2078
    /* check that the ID is unique */
2079
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
2080
        return -ENOENT;
2081

    
2082
    sn->id_str = qemu_strdup(sn_info->id_str);
2083
    if (!sn->id_str)
2084
        goto fail;
2085
    sn->name = qemu_strdup(sn_info->name);
2086
    if (!sn->name)
2087
        goto fail;
2088
    sn->vm_state_size = sn_info->vm_state_size;
2089
    sn->date_sec = sn_info->date_sec;
2090
    sn->date_nsec = sn_info->date_nsec;
2091
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
2092

    
2093
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
2094
    if (ret < 0)
2095
        goto fail;
2096

    
2097
    /* create the L1 table of the snapshot */
2098
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
2099
    sn->l1_size = s->l1_size;
2100

    
2101
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
2102
    for(i = 0; i < s->l1_size; i++) {
2103
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
2104
    }
2105
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
2106
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
2107
        (s->l1_size * sizeof(uint64_t)))
2108
        goto fail;
2109
    qemu_free(l1_table);
2110
    l1_table = NULL;
2111

    
2112
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
2113
    if (s->snapshots) {
2114
        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
2115
        qemu_free(s->snapshots);
2116
    }
2117
    s->snapshots = snapshots1;
2118
    s->snapshots[s->nb_snapshots++] = *sn;
2119

    
2120
    if (qcow_write_snapshots(bs) < 0)
2121
        goto fail;
2122
#ifdef DEBUG_ALLOC
2123
    check_refcounts(bs);
2124
#endif
2125
    return 0;
2126
 fail:
2127
    qemu_free(sn->name);
2128
    qemu_free(l1_table);
2129
    return -1;
2130
}
2131

    
2132
/* copy the snapshot 'snapshot_name' into the current disk image */
/*
 * Revert the active state to the given snapshot (matched by id or name):
 * drop the references held by the current L1 tree, install the snapshot's
 * L1 table as the active one (on disk and in memory), then re-add
 * references for the newly active tree so COPIED flags come out right.
 *
 * Returns 0 on success, -ENOENT if the snapshot does not exist, -EIO on
 * any I/O or refcount failure.
 */
static int qcow_snapshot_goto(BlockDriverState *bs,
                              const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int i, snapshot_index, l1_size2;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    /* release the clusters currently referenced by the active state */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
        goto fail;

    /* make sure the active L1 table can hold the snapshot's entries */
    if (grow_l1_table(bs, sn->l1_size) < 0)
        goto fail;

    s->l1_size = sn->l1_size;
    l1_size2 = s->l1_size * sizeof(uint64_t);
    /* copy the snapshot l1 table to the current l1 table */
    if (bdrv_pread(s->hd, sn->l1_table_offset,
                   s->l1_table, l1_size2) != l1_size2)
        goto fail;
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
                    s->l1_table, l1_size2) != l1_size2)
        goto fail;
    /* the on-disk table is big-endian; convert the in-memory copy */
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }

    /* the active state references the snapshot's clusters now */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
 fail:
    return -EIO;
}
2174

    
2175
/*
 * Delete the snapshot matched by id or name: drop the references its L1
 * tree held, refresh the COPIED flags of the active tree (addend 0),
 * free the snapshot's L1 table clusters, remove the record and rewrite
 * the on-disk snapshot list.
 *
 * Returns 0 on success, -ENOENT if not found, or a negative error from
 * the refcount/serialization helpers.
 */
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int snapshot_index, ret;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
    if (ret < 0)
        return ret;
    /* must update the copied flag on the current cluster offsets */
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
    if (ret < 0)
        return ret;
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));

    /* remove the record by sliding the tail of the array down one slot */
    qemu_free(sn->id_str);
    qemu_free(sn->name);
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
    s->nb_snapshots--;
    ret = qcow_write_snapshots(bs);
    if (ret < 0) {
        /* XXX: restore snapshot if error ? */
        return ret;
    }
#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
}
2209

    
2210
static int qcow_snapshot_list(BlockDriverState *bs,
2211
                              QEMUSnapshotInfo **psn_tab)
2212
{
2213
    BDRVQcowState *s = bs->opaque;
2214
    QEMUSnapshotInfo *sn_tab, *sn_info;
2215
    QCowSnapshot *sn;
2216
    int i;
2217

    
2218
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2219
    for(i = 0; i < s->nb_snapshots; i++) {
2220
        sn_info = sn_tab + i;
2221
        sn = s->snapshots + i;
2222
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2223
                sn->id_str);
2224
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2225
                sn->name);
2226
        sn_info->vm_state_size = sn->vm_state_size;
2227
        sn_info->date_sec = sn->date_sec;
2228
        sn_info->date_nsec = sn->date_nsec;
2229
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2230
    }
2231
    *psn_tab = sn_tab;
2232
    return s->nb_snapshots;
2233
}
2234

    
2235
/*********************************************************/
2236
/* refcount handling */
2237

    
2238
/*
 * Allocate the refcount block cache and load the refcount table from
 * disk (converting entries from big-endian to CPU order).
 *
 * Returns 0 on success, -EIO if the table could not be read.
 * (bug fix: the failure path used to return -ENOMEM, but the only way
 * to reach it is a short read from bdrv_pread, not an allocation
 * failure — callers only test for < 0, so this stays compatible.)
 */
static int refcount_init(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int ret, refcount_table_size2, i;

    /* one cluster-sized cache holding the most recently used refcount block */
    s->refcount_block_cache = qemu_malloc(s->cluster_size);
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = qemu_malloc(refcount_table_size2);
    if (s->refcount_table_size > 0) {
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret != refcount_table_size2)
            goto fail;
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
    }
    return 0;
 fail:
    /* buffers stay allocated; refcount_close() releases them on close */
    return -EIO;
}
2258

    
2259
/* Counterpart of refcount_init(): release the refcount table and the
   refcount block cache. */
static void refcount_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    qemu_free(s->refcount_block_cache);
    qemu_free(s->refcount_table);
}
2265

    
2266

    
2267
/* Read one refcount block into the single-entry cache and remember its
   offset.  Returns 0 on success, -EIO on a short read. */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset)
{
    BDRVQcowState *s = bs->opaque;
    int nread;

    nread = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
                       s->cluster_size);
    if (nread != s->cluster_size)
        return -EIO;

    s->refcount_block_cache_offset = refcount_block_offset;
    return 0;
}
2279

    
2280
/* Look up the refcount of one cluster.  Clusters past the table, or in a
   table slot with no refcount block, report 0 (unallocated). */
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *s = bs->opaque;
    int table_idx, idx_in_block;
    int64_t block_offset;

    table_idx = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (table_idx >= s->refcount_table_size)
        return 0;

    block_offset = s->refcount_table[table_idx];
    if (!block_offset)
        return 0;

    if (block_offset != s->refcount_block_cache_offset) {
        /* better than nothing: return allocated if read error */
        if (load_refcount_block(bs, block_offset) < 0)
            return 1;
    }

    idx_in_block = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    return be16_to_cpu(s->refcount_block_cache[idx_in_block]);
}
2301

    
2302
/* return < 0 if error */
2303
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2304
{
2305
    BDRVQcowState *s = bs->opaque;
2306
    int i, nb_clusters;
2307

    
2308
    nb_clusters = size_to_clusters(s, size);
2309
retry:
2310
    for(i = 0; i < nb_clusters; i++) {
2311
        int64_t i = s->free_cluster_index++;
2312
        if (get_refcount(bs, i) != 0)
2313
            goto retry;
2314
    }
2315
#ifdef DEBUG_ALLOC2
2316
    printf("alloc_clusters: size=%lld -> %lld\n",
2317
            size,
2318
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2319
#endif
2320
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2321
}
2322

    
2323
/* Allocate 'size' bytes of clusters and set their refcounts to 1.
   Returns the offset of the first cluster. */
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
{
    int64_t offset = alloc_clusters_noref(bs, size);

    update_refcount(bs, offset, size, 1);
    return offset;
}
2331

    
2332
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
/*
 * Sub-cluster byte allocator: hands out 'size' bytes from a partially
 * filled cluster tracked by s->free_byte_offset, allocating a new
 * cluster when the current one cannot hold the request.  Each cluster's
 * refcount counts how many compressed allocations share it.
 * Returns the byte offset of the allocation.
 */
static int64_t alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, cluster_offset;
    int free_in_cluster;

    assert(size > 0 && size <= s->cluster_size);
    if (s->free_byte_offset == 0) {
        /* no partially-filled cluster yet: start one (refcount 1) */
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
    }
 redo:
    free_in_cluster = s->cluster_size -
        (s->free_byte_offset & (s->cluster_size - 1));
    if (size <= free_in_cluster) {
        /* enough space in current cluster */
        offset = s->free_byte_offset;
        s->free_byte_offset += size;
        free_in_cluster -= size;
        if (free_in_cluster == 0)
            s->free_byte_offset = 0;
        /* a second (or later) allocation in this cluster adds a
           reference; the first one was counted by alloc_clusters() */
        if ((offset & (s->cluster_size - 1)) != 0)
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
    } else {
        offset = alloc_clusters(bs, s->cluster_size);
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
        if ((cluster_offset + s->cluster_size) == offset) {
            /* we are lucky: contiguous data */
            /* straddle the boundary: start in the old cluster and spill
               into the adjacent new one */
            offset = s->free_byte_offset;
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
            s->free_byte_offset += size;
        } else {
            /* not adjacent: abandon the old tail and retry in the
               fresh cluster */
            s->free_byte_offset = offset;
            goto redo;
        }
    }
    return offset;
}
2371

    
2372
/* Drop one reference from every cluster in [offset, offset + size). */
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size)
{
    update_refcount(bs, offset, size, -1);
}
2377

    
2378
/*
 * Grow the refcount table so it has at least 'min_size' entries.
 * Sizes grow geometrically (x1.5 in clusters).  The new table is written
 * to freshly allocated clusters, the header is pointed at it, and only
 * then are the new clusters' refcounts recorded and the old table freed
 * (ordering avoids recursing into this function mid-update).
 *
 * Returns 0 on success (or if already large enough), -EIO on failure.
 */
static int grow_refcount_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
    uint64_t *new_table;
    int64_t table_offset;
    uint8_t data[12];
    int old_table_size;
    int64_t old_table_offset;

    if (min_size <= s->refcount_table_size)
        return 0;
    /* compute new table size */
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
    for(;;) {
        if (refcount_table_clusters == 0) {
            refcount_table_clusters = 1;
        } else {
            /* grow by 1.5x until min_size fits */
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
        }
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
        if (min_size <= new_table_size)
            break;
    }
#ifdef DEBUG_ALLOC2
    printf("grow_refcount_table from %d to %d\n",
           s->refcount_table_size,
           new_table_size);
#endif
    new_table_size2 = new_table_size * sizeof(uint64_t);
    new_table = qemu_mallocz(new_table_size2);
    memcpy(new_table, s->refcount_table,
           s->refcount_table_size * sizeof(uint64_t));
    /* swap to big-endian in place for the disk write... */
    for(i = 0; i < s->refcount_table_size; i++)
        cpu_to_be64s(&new_table[i]);
    /* Note: we cannot update the refcount now to avoid recursion */
    table_offset = alloc_clusters_noref(bs, new_table_size2);
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
    if (ret != new_table_size2)
        goto fail;
    /* ...and back to CPU order for in-memory use */
    for(i = 0; i < s->refcount_table_size; i++)
        be64_to_cpus(&new_table[i]);

    /* update refcount_table_offset (8 bytes) and
       refcount_table_clusters (4 bytes) in the header in one write;
       the two fields are adjacent on disk */
    cpu_to_be64w((uint64_t*)data, table_offset);
    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
                    data, sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->refcount_table);
    old_table_offset = s->refcount_table_offset;
    old_table_size = s->refcount_table_size;
    s->refcount_table = new_table;
    s->refcount_table_size = new_table_size;
    s->refcount_table_offset = table_offset;

    /* now that the new table is live, account for its own clusters and
       release the old table's */
    update_refcount(bs, table_offset, new_table_size2, 1);
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
    return 0;
 fail:
    free_clusters(bs, table_offset, new_table_size2);
    qemu_free(new_table);
    return -EIO;
}
2441

    
2442
/* addend must be 1 or -1 */
/* XXX: cache several refcount block clusters ? */
/*
 * Add 'addend' to one cluster's refcount, growing the refcount table and
 * allocating a refcount block on demand (only for addend > 0).  Keeps
 * s->free_cluster_index pointing at the lowest possibly-free cluster.
 *
 * Returns the new refcount (>= 0), -EINVAL on invalid state/overflow,
 * -EIO on I/O failure.
 */
static int update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
                                   int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, refcount_block_offset;
    int ret, refcount_table_index, block_index, refcount;
    uint64_t data64;

    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (refcount_table_index >= s->refcount_table_size) {
        /* decrementing past the table means the refcount was already 0 */
        if (addend < 0)
            return -EINVAL;
        ret = grow_refcount_table(bs, refcount_table_index + 1);
        if (ret < 0)
            return ret;
    }
    refcount_block_offset = s->refcount_table[refcount_table_index];
    if (!refcount_block_offset) {
        if (addend < 0)
            return -EINVAL;
        /* create a new refcount block */
        /* Note: we cannot update the refcount now to avoid recursion */
        offset = alloc_clusters_noref(bs, s->cluster_size);
        memset(s->refcount_block_cache, 0, s->cluster_size);
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
        if (ret != s->cluster_size)
            return -EINVAL;
        s->refcount_table[refcount_table_index] = offset;
        data64 = cpu_to_be64(offset);
        /* persist the new table entry */
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
                          refcount_table_index * sizeof(uint64_t),
                          &data64, sizeof(data64));
        if (ret != sizeof(data64))
            return -EINVAL;

        refcount_block_offset = offset;
        s->refcount_block_cache_offset = offset;
        /* safe to count the block's own cluster now that it exists */
        update_refcount(bs, offset, s->cluster_size, 1);
    } else {
        if (refcount_block_offset != s->refcount_block_cache_offset) {
            if (load_refcount_block(bs, refcount_block_offset) < 0)
                return -EIO;
        }
    }
    /* we can update the count and save it */
    block_index = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
    refcount += addend;
    if (refcount < 0 || refcount > 0xffff)
        return -EINVAL;
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
        /* let the allocator re-scan from this newly freed cluster */
        s->free_cluster_index = cluster_index;
    }
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
    /* write back just the 2-byte entry that changed */
    if (bdrv_pwrite(s->hd,
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
                    &s->refcount_block_cache[block_index], 2) != 2)
        return -EIO;
    return refcount;
}
2506

    
2507
/* Apply 'addend' to the refcount of every cluster overlapping the byte
   range [offset, offset + length).  A non-positive length is a no-op.
   NOTE(review): errors from update_cluster_refcount() are ignored here,
   as in the original. */
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t first, last, cur;

#ifdef DEBUG_ALLOC2
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
           offset, length, addend);
#endif
    if (length <= 0)
        return;

    /* round the range out to whole clusters */
    first = offset & ~(s->cluster_size - 1);
    last = (offset + length - 1) & ~(s->cluster_size - 1);
    for (cur = first; cur <= last; cur += s->cluster_size)
        update_cluster_refcount(bs, cur >> s->cluster_bits, addend);
}
2527

    
2528
#ifdef DEBUG_ALLOC
2529
/* Consistency-check helper: bump the shadow refcount of every cluster
   overlapping [offset, offset + size) in 'refcount_table', reporting
   out-of-range offsets and 16-bit counter overflow on stdout. */
static void inc_refcounts(BlockDriverState *bs,
                          uint16_t *refcount_table,
                          int refcount_table_size,
                          int64_t offset, int64_t size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t first, last, cur;
    int idx;

    if (size <= 0)
        return;

    /* round the range out to whole clusters */
    first = offset & ~(s->cluster_size - 1);
    last = (offset + size - 1) & ~(s->cluster_size - 1);
    for (cur = first; cur <= last; cur += s->cluster_size) {
        idx = cur >> s->cluster_bits;
        if (idx < 0 || idx >= refcount_table_size) {
            printf("ERROR: invalid cluster offset=0x%llx\n", cur);
            continue;
        }
        if (++refcount_table[idx] == 0) {
            printf("ERROR: overflow cluster offset=0x%llx\n", cur);
        }
    }
}
2555

    
2556
/* Debug helper for check_refcounts(): walk one L1 table (the active one
 * or a snapshot's) and account every cluster it references — the L1
 * table itself, each L2 table, and every data cluster (compressed or
 * not) — into the in-memory shadow refcount table.
 *
 * When check_copied is non-zero, also verify the QCOW_OFLAG_COPIED
 * invariant: the flag must be set if and only if the on-disk refcount
 * of the cluster is exactly 1 (only meaningful for the active table).
 *
 * Returns 0 on success, -EIO if reading the L1 or an L2 table fails.
 *
 * Fix versus the previous version: the printf "%llx"/"%lld" conversions
 * were fed uint64_t arguments directly, which is undefined behavior on
 * hosts where [u]int64_t is not (unsigned) long long; explicit casts
 * added. */
static int check_refcounts_l1(BlockDriverState *bs,
                              uint16_t *refcount_table,
                              int refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int check_copied)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
    int l2_size, i, j, nb_csectors, refcount;

    l2_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* Account for the clusters occupied by the L1 table itself. */
    inc_refcounts(bs, refcount_table, refcount_table_size,
                  l1_table_offset, l1_size2);

    l1_table = qemu_malloc(l1_size2);
    if (bdrv_pread(s->hd, l1_table_offset,
                   l1_table, l1_size2) != l1_size2)
        goto fail;
    /* L1 entries are stored big-endian on disk. */
    for(i = 0;i < l1_size; i++)
        be64_to_cpus(&l1_table[i]);

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            if (check_copied) {
                /* COPIED must mirror "refcount == 1" on the L2 table. */
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
                    printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
                           (unsigned long long)l2_offset, refcount);
                }
            }
            l2_offset &= ~QCOW_OFLAG_COPIED;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* COPIED makes no sense on compressed clusters:
                         * they are never written to in place. */
                        if (offset & QCOW_OFLAG_COPIED) {
                            printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
                                   (long long)(offset >> s->cluster_bits));
                            offset &= ~QCOW_OFLAG_COPIED;
                        }
                        /* Compressed size is encoded in 512-byte sectors
                         * inside the cluster descriptor. */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        offset &= s->cluster_offset_mask;
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset & ~511, nb_csectors * 512);
                    } else {
                        if (check_copied) {
                            refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                            if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
                                printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
                                       (unsigned long long)offset, refcount);
                            }
                        }
                        offset &= ~QCOW_OFLAG_COPIED;
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset, s->cluster_size);
                    }
                }
            }
            /* Account for the L2 table cluster itself. */
            inc_refcounts(bs, refcount_table,
                          refcount_table_size,
                          l2_offset,
                          s->cluster_size);
        }
    }
    qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    printf("ERROR: I/O error in check_refcounts_l1\n");
    qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
2639

    
2640
/* Debug-only consistency check: rebuild the expected refcount of every
 * cluster from the image metadata (header, active L1/L2 tables, all
 * snapshot L1/L2 tables, the snapshot table, and the refcount
 * structures themselves), then print one line for each cluster whose
 * on-disk refcount disagrees with the recomputed value. */
static void check_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t image_size;
    int total_clusters, stored, expected, k;
    QCowSnapshot *snap;
    uint16_t *shadow_table;

    image_size = bdrv_getlength(s->hd);
    total_clusters = size_to_clusters(s, image_size);
    shadow_table = qemu_mallocz(total_clusters * sizeof(uint16_t));

    /* header */
    inc_refcounts(bs, shadow_table, total_clusters,
                  0, s->cluster_size);

    /* active L1/L2 tables and the data they map (COPIED flag checked) */
    check_refcounts_l1(bs, shadow_table, total_clusters,
                       s->l1_table_offset, s->l1_size, 1);

    /* snapshots */
    for (k = 0; k < s->nb_snapshots; k++) {
        snap = s->snapshots + k;
        check_refcounts_l1(bs, shadow_table, total_clusters,
                           snap->l1_table_offset, snap->l1_size, 0);
    }
    inc_refcounts(bs, shadow_table, total_clusters,
                  s->snapshots_offset, s->snapshots_size);

    /* refcount data: the refcount table plus every refcount block */
    inc_refcounts(bs, shadow_table, total_clusters,
                  s->refcount_table_offset,
                  s->refcount_table_size * sizeof(uint64_t));
    for (k = 0; k < s->refcount_table_size; k++) {
        int64_t block_offset = s->refcount_table[k];
        if (block_offset != 0) {
            inc_refcounts(bs, shadow_table, total_clusters,
                          block_offset, s->cluster_size);
        }
    }

    /* compare ref counts */
    for (k = 0; k < total_clusters; k++) {
        stored = get_refcount(bs, k);
        expected = shadow_table[k];
        if (stored != expected)
            printf("ERROR cluster %d refcount=%d reference=%d\n",
                   k, stored, expected);
    }

    qemu_free(shadow_table);
}
2692

    
2693
#if 0
2694
/* Debug helper (currently compiled out via "#if 0"): print the image's
 * refcounts as runs, one line per run of consecutive clusters sharing
 * the same refcount, in the form "start_cluster: refcount=r nb=len". */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(s->hd);
    nb_clusters = size_to_clusters(s, size);
    for(k = 0; k < nb_clusters;) {
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        /* extend the run while the refcount stays the same */
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        /* Fixes versus the previous version: report the run's first
         * cluster (k1), not the cluster after the run (k); and cast the
         * "%lld" arguments, since int64_t need not be long long. */
        printf("%lld: refcount=%d nb=%lld\n",
               (long long)k1, refcount, (long long)(k - k1));
    }
}
2711
#endif
2712
#endif
2713

    
2714
/* Driver registration table: wires the qcow2 callbacks into the generic
 * block layer.  Any callback left unset here is treated as unsupported
 * by the format. */
BlockDriver bdrv_qcow2 = {
    /* identification and basic open/close/create lifecycle */
    .format_name        = "qcow2",
    .instance_size        = sizeof(BDRVQcowState),
    .bdrv_probe                = qcow_probe,
    .bdrv_open                = qcow_open,
    .bdrv_close                = qcow_close,
    .bdrv_create        = qcow_create,
    .bdrv_flush                = qcow_flush,
    .bdrv_is_allocated        = qcow_is_allocated,
    .bdrv_set_key        = qcow_set_key,
    .bdrv_make_empty        = qcow_make_empty,

    /* asynchronous I/O path and compressed-write support */
    .bdrv_aio_read        = qcow_aio_read,
    .bdrv_aio_write        = qcow_aio_write,
    .bdrv_aio_cancel        = qcow_aio_cancel,
    .aiocb_size                = sizeof(QCowAIOCB),
    .bdrv_write_compressed = qcow_write_compressed,

    /* internal snapshot management */
    .bdrv_snapshot_create = qcow_snapshot_create,
    .bdrv_snapshot_goto        = qcow_snapshot_goto,
    .bdrv_snapshot_delete = qcow_snapshot_delete,
    .bdrv_snapshot_list        = qcow_snapshot_list,
    .bdrv_get_info        = qcow_get_info,

    /* extended create (backing file, flags) */
    .bdrv_create2 = qcow_create2,
};