Statistics
| Branch: | Revision:

root / block-qcow2.c @ 3f4cb3d3

History | View | Annotate | Download (85.7 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block_int.h"
26
#include <zlib.h>
27
#include "aes.h"
28
#include <assert.h>
29

    
30
/*
31
  Differences with QCOW:
32

33
  - Support for multiple incremental snapshots.
34
  - Memory management by reference counts.
35
  - Clusters which have a reference count of one have the bit
36
    QCOW_OFLAG_COPIED to optimize write performance.
37
  - Size of compressed clusters is stored in sectors to reduce bit usage
38
    in the cluster offsets.
39
  - Support for storing additional data (such as the VM state) in the
40
    snapshots.
41
  - If a backing store is used, the cluster size is not constrained
42
    (could be backported to QCOW).
43
  - L2 tables have always a size of one cluster.
44
*/
45

    
46
//#define DEBUG_ALLOC
47
//#define DEBUG_ALLOC2
48
//#define DEBUG_EXT
49

    
50
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
51
#define QCOW_VERSION 2
52

    
53
#define QCOW_CRYPT_NONE 0
54
#define QCOW_CRYPT_AES  1
55

    
56
#define QCOW_MAX_CRYPT_CLUSTERS 32
57

    
58
/* indicate that the refcount of the referenced cluster is exactly one. */
59
#define QCOW_OFLAG_COPIED     (1LL << 63)
60
/* indicate that the cluster is compressed (they never have the copied flag) */
61
#define QCOW_OFLAG_COMPRESSED (1LL << 62)
62

    
63
#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
64

    
65
/* On-disk qcow2 file header.  All fields are stored big-endian on disk
 * and byte-swapped in place right after reading (see qcow_open). */
typedef struct QCowHeader {
    uint32_t magic;                   /* must be QCOW_MAGIC ("QFI\xfb") */
    uint32_t version;                 /* must be QCOW_VERSION (2) */
    uint64_t backing_file_offset;     /* offset of backing file name; 0 = no backing file */
    uint32_t backing_file_size;       /* length of backing file name in bytes (no NUL) */
    uint32_t cluster_bits;            /* log2 of cluster size; qcow_open accepts 9..16 */
    uint64_t size; /* in bytes */
    uint32_t crypt_method;            /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
    uint64_t l1_table_offset;         /* offset of the active L1 table */
    uint64_t refcount_table_offset;   /* offset of the refcount table */
    uint32_t refcount_table_clusters; /* refcount table size, in clusters */
    uint32_t nb_snapshots;            /* number of entries in the snapshot table */
    uint64_t snapshots_offset;        /* offset of the snapshot table */
} QCowHeader;
80

    
81

    
82
/* Fixed header of a qcow2 header extension: a 32-bit magic identifying
 * the extension type followed by the payload length in bytes.  Payloads
 * are padded to a multiple of 8 bytes on disk (see qcow_read_extensions). */
typedef struct {
    uint32_t magic;   /* extension type identifier */
    uint32_t len;     /* payload length in bytes, excluding padding */
} QCowExtension;
#define  QCOW_EXT_MAGIC_END 0
#define  QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
88

    
89

    
90
/* On-disk layout of one entry in the snapshot table (packed so the
 * struct matches the disk format exactly).  Variable-length data
 * (extra data, id string, name) follows the fixed part. */
typedef struct __attribute__((packed)) QCowSnapshotHeader {
    /* header is 8 byte aligned */
    uint64_t l1_table_offset;   /* offset of this snapshot's L1 table */

    uint32_t l1_size;           /* number of entries in that L1 table */
    uint16_t id_str_size;       /* length of the id string that follows */
    uint16_t name_size;         /* length of the name that follows */

    uint32_t date_sec;          /* snapshot creation time (seconds part) */
    uint32_t date_nsec;         /* snapshot creation time (nanoseconds part) */

    uint64_t vm_clock_nsec;     /* guest VM clock at snapshot time */

    uint32_t vm_state_size;     /* size of saved VM state, 0 if none */
    uint32_t extra_data_size; /* for extension */
    /* extra data follows */
    /* id_str follows */
    /* name follows  */
} QCowSnapshotHeader;
109

    
110
#define L2_CACHE_SIZE 16
111

    
112
/* In-memory representation of one snapshot; mirrors QCowSnapshotHeader
 * with the id string and name held as heap-allocated C strings. */
typedef struct QCowSnapshot {
    uint64_t l1_table_offset;   /* offset of the snapshot's L1 table */
    uint32_t l1_size;           /* number of L1 entries */
    char *id_str;               /* snapshot id (owned, heap-allocated) */
    char *name;                 /* snapshot name (owned, heap-allocated) */
    uint32_t vm_state_size;     /* saved VM state size, 0 if none */
    uint32_t date_sec;          /* creation time, seconds */
    uint32_t date_nsec;         /* creation time, nanoseconds */
    uint64_t vm_clock_nsec;     /* guest clock at snapshot time */
} QCowSnapshot;
122

    
123
/* Per-image driver state, stored in BlockDriverState.opaque.
 * All derived geometry fields are computed once in qcow_open. */
typedef struct BDRVQcowState {
    BlockDriverState *hd;            /* underlying (protocol) file */
    int cluster_bits;                /* log2(cluster_size), from the header */
    int cluster_size;                /* 1 << cluster_bits */
    int cluster_sectors;             /* cluster size in 512-byte sectors */
    int l2_bits;                     /* log2(L2 entries per table) = cluster_bits - 3 */
    int l2_size;                     /* entries per L2 table (one cluster worth) */
    int l1_size;                     /* entries in the active L1 table */
    int l1_vm_state_index;           /* first L1 index past the guest-visible disk */
    int csize_shift;                 /* shift to extract compressed-size field */
    int csize_mask;                  /* mask for the compressed-size field */
    uint64_t cluster_offset_mask;    /* mask for the offset part of an L2 entry */
    uint64_t l1_table_offset;        /* file offset of the active L1 table */
    uint64_t *l1_table;              /* active L1 table, host byte order */
    uint64_t *l2_cache;              /* L2_CACHE_SIZE tables, big-endian as on disk */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];  /* file offset of each cached table */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];   /* hit counters for eviction */
    uint8_t *cluster_cache;          /* one decompressed cluster */
    uint8_t *cluster_data;           /* bounce buffer (+512 for alignment) */
    uint64_t cluster_cache_offset;   /* offset cached in cluster_cache, -1 if none */

    uint64_t *refcount_table;        /* in-memory refcount table */
    uint64_t refcount_table_offset;  /* its file offset */
    uint32_t refcount_table_size;    /* number of refcount-table entries */
    uint64_t refcount_block_cache_offset;  /* offset of the cached refcount block */
    uint16_t *refcount_block_cache;  /* one cached refcount block */
    int64_t free_cluster_index;      /* next cluster index to try when allocating */
    int64_t free_byte_offset;        /* sub-cluster allocator position (compressed data) */

    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header;    /* crypt method announced by the header */
    AES_KEY aes_encrypt_key;         /* expanded key for writes */
    AES_KEY aes_decrypt_key;         /* expanded key for reads */
    uint64_t snapshots_offset;       /* file offset of the snapshot table */
    int snapshots_size;              /* byte size of the snapshot table */
    int nb_snapshots;                /* number of snapshots */
    QCowSnapshot *snapshots;         /* parsed snapshot table */
} BDRVQcowState;
161

    
162
/* Forward declarations -- cluster decompression, raw sector reads,
 * snapshot handling and reference-count management are implemented
 * further down in this file. */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors);
static int qcow_read_snapshots(BlockDriverState *bs);
static void qcow_free_snapshots(BlockDriverState *bs);
static int refcount_init(BlockDriverState *bs);
static void refcount_close(BlockDriverState *bs);
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
static int update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
                                   int addend);
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend);
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
static int64_t alloc_bytes(BlockDriverState *bs, int size);
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size);
#ifdef DEBUG_ALLOC
static void check_refcounts(BlockDriverState *bs);
#endif
183

    
184
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
185
{
186
    const QCowHeader *cow_header = (const void *)buf;
187

    
188
    if (buf_size >= sizeof(QCowHeader) &&
189
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
190
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
191
        return 100;
192
    else
193
        return 0;
194
}
195

    
196

    
197
/*
 * read qcow2 extension and fill bs
 * start reading from start_offset
 * finish reading upon magic of value 0 or when end_offset reached
 * unknown magic is skipped (future extension this version knows nothing about)
 * return 0 upon success, non-0 otherwise
 *   1 - header read failed, 2 - backing format string too long,
 *   3 - backing format payload read failed
 */
static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
                                uint64_t end_offset)
{
    BDRVQcowState *s = bs->opaque;
    QCowExtension ext;
    uint64_t offset;

#ifdef DEBUG_EXT
    printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
#endif
    offset = start_offset;
    while (offset < end_offset) {

#ifdef DEBUG_EXT
        /* Sanity check */
        if (offset > s->cluster_size)
            printf("qcow_handle_extension: suspicious offset %lu\n", offset);

        printf("attemting to read extended header in offset %lu\n", offset);
#endif

        /* fixed extension header: 32-bit magic + 32-bit payload length,
           both big-endian on disk */
        if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
                    (unsigned long long)offset);
            return 1;
        }
        be32_to_cpus(&ext.magic);
        be32_to_cpus(&ext.len);
        offset += sizeof(ext);
#ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
#endif
        switch (ext.magic) {
        case QCOW_EXT_MAGIC_END:
            return 0;

        case QCOW_EXT_MAGIC_BACKING_FORMAT:
            /* must leave room for the trailing NUL we append below */
            if (ext.len >= sizeof(bs->backing_format)) {
                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
                        " (>=%zu)\n",
                        ext.len, sizeof(bs->backing_format));
                return 2;
            }
            if (bdrv_pread(s->hd, offset , bs->backing_format,
                           ext.len) != ext.len)
                return 3;
            bs->backing_format[ext.len] = '\0';
#ifdef DEBUG_EXT
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
#endif
            /* payloads are padded to a multiple of 8 bytes on disk */
            offset += ((ext.len + 7) & ~7);
            break;

        default:
            /* unknown magic -- just skip it (payload is 8-byte padded) */
            offset += ((ext.len + 7) & ~7);
            break;
        }
    }

    return 0;
}
266

    
267

    
268
/* Open a qcow2 image: read and validate the header, derive the cluster
 * geometry, load the L1 table, set up the L2/refcount caches, parse
 * header extensions, the backing file name and the snapshot table.
 * Returns 0 on success, a negative value on failure (all partially
 * acquired resources are released on the fail path). */
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift, ret;
    QCowHeader header;
    uint64_t ext_end;

    /* Performance is terrible right now with cache=writethrough due mainly
     * to reference count updates.  If the user does not explicitly specify
     * a caching type, force to writeback caching.
     */
    if ((flags & BDRV_O_CACHE_DEF)) {
        flags |= BDRV_O_CACHE_WB;
        flags &= ~BDRV_O_CACHE_DEF;
    }
    ret = bdrv_file_open(&s->hd, filename, flags);
    if (ret < 0)
        return ret;
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
        goto fail;
    /* the on-disk header is big-endian: convert every field in place */
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    /* basic sanity: right magic/version, non-trivial size and a cluster
       size between 512 bytes and 64 KB */
    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
        goto fail;
    if (header.size <= 1 ||
        header.cluster_bits < 9 ||
        header.cluster_bits > 16)
        goto fail;
    if (header.crypt_method > QCOW_CRYPT_AES)
        goto fail;
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header)
        bs->encrypted = 1;
    /* derive cluster geometry and the masks used to decode L2 entries */
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
    s->l1_size = header.l1_size;
    shift = s->cluster_bits + s->l2_bits;
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index)
        goto fail;
    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
        s->l1_size * sizeof(uint64_t))
        goto fail;
    /* the L1 table is kept in host byte order in memory */
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    s->cluster_cache = qemu_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                                  + 512);
    s->cluster_cache_offset = -1;

    if (refcount_init(bs) < 0)
        goto fail;

    /* read qcow2 extensions: they live between the fixed header and
       either the backing file name or the end of the first cluster */
    if (header.backing_file_offset)
        ext_end = header.backing_file_offset;
    else
        ext_end = s->cluster_size;
    if (qcow_read_extensions(bs, sizeof(header), ext_end))
        goto fail;

    /* read the backing file name (truncated to fit bs->backing_file) */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023)
            len = 1023;
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
            goto fail;
        bs->backing_file[len] = '\0';
    }
    if (qcow_read_snapshots(bs) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;

 fail:
    /* release everything acquired so far; these helpers tolerate
       partially-initialized state */
    qcow_free_snapshots(bs);
    refcount_close(bs);
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    bdrv_delete(s->hd);
    return -1;
}
391

    
392
/* Derive the AES encryption and decryption key schedules from a
 * user-supplied passphrase.  The passphrase is truncated or
 * zero-padded to 16 bytes and used directly as a 128-bit AES key.
 * Returns 0 on success, -1 if key expansion fails. */
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len;

    memset(keybuf, 0, sizeof(keybuf));
    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    memcpy(keybuf, key, len);
    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
        return -1;
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;
    return 0;
}
433

    
434
/* The crypt function is compatible with the linux cryptoloop
435
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
436
   supported */
437
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
438
                            uint8_t *out_buf, const uint8_t *in_buf,
439
                            int nb_sectors, int enc,
440
                            const AES_KEY *key)
441
{
442
    union {
443
        uint64_t ll[2];
444
        uint8_t b[16];
445
    } ivec;
446
    int i;
447

    
448
    for(i = 0; i < nb_sectors; i++) {
449
        ivec.ll[0] = cpu_to_le64(sector_num);
450
        ivec.ll[1] = 0;
451
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
452
                        ivec.b, enc);
453
        sector_num++;
454
        in_buf += 512;
455
        out_buf += 512;
456
    }
457
}
458

    
459
/* Copy-on-write helper: read sectors [n_start, n_end) of the cluster
 * at guest sector 'start_sect' through the normal read path (which
 * resolves backing files and decompression), re-encrypt them if the
 * image is encrypted, and write them to the host cluster at
 * 'cluster_offset'.  Returns 0 on success, a negative error otherwise. */
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                        uint64_t cluster_offset, int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int nb_sectors = n_end - n_start;
    int ret;

    if (nb_sectors <= 0)
        return 0;

    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, nb_sectors);
    if (ret < 0)
        return ret;

    if (s->crypt_method) {
        /* data is stored encrypted; re-encrypt before writing back */
        encrypt_sectors(s, start_sect + n_start,
                        s->cluster_data, s->cluster_data,
                        nb_sectors, 1, &s->aes_encrypt_key);
    }

    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
                     s->cluster_data, nb_sectors);
    return ret < 0 ? ret : 0;
}
483

    
484
/* Invalidate the whole L2 table cache (tables, offsets and counters). */
static void l2_cache_reset(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_offsets, 0, sizeof(s->l2_cache_offsets));
    memset(s->l2_cache_counts, 0, sizeof(s->l2_cache_counts));
}
492

    
493
/* Pick the cache slot to evict: the first slot with the lowest hit
 * counter.  Returns the slot index. */
static inline int l2_cache_new_entry(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int i, victim = 0;

    for (i = 1; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < s->l2_cache_counts[victim])
            victim = i;
    }
    return victim;
}
510

    
511
/* Round 'offset' up to the next multiple of 'n'.
 * 'n' must be a power of two for the mask trick to be valid. */
static int64_t align_offset(int64_t offset, int n)
{
    return (offset + n - 1) & ~(n - 1);
}
516

    
517
/* Grow the L1 table so it holds at least 'min_size' entries: allocate
 * a larger copy, write it to freshly allocated clusters, publish the
 * new size/offset in the image header, then free the old table.
 * Returns 0 on success (including when no growth is needed), -EIO on
 * write failure.
 *
 * Fixes over the previous version:
 *  - the fail path used to free s->l1_table (still referenced by the
 *    driver state) and leak new_l1_table; it now frees new_l1_table
 *    and leaves the published state untouched.
 *  - growth is seeded with 1 so that an empty L1 table cannot make
 *    the 3/2 growth loop spin forever ((0*3+1)/2 == 0). */
static int grow_l1_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size, new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    uint64_t new_l1_table_offset;
    uint8_t data[12];

    new_l1_size = s->l1_size;
    if (min_size <= new_l1_size)
        return 0;
    if (new_l1_size == 0) {
        new_l1_size = 1;
    }
    /* grow by at least 50% per round until large enough */
    while (min_size > new_l1_size) {
        new_l1_size = (new_l1_size * 3 + 1) / 2;
    }
#ifdef DEBUG_ALLOC2
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_mallocz(new_l1_size2);
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);

    /* the table is stored big-endian: swap, write, swap back */
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
    if (ret != new_l1_size2)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* publish the new table: header's l1_size (4 bytes) and
       l1_table_offset (8 bytes) are adjacent, written in one go */
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
                sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->l1_table);
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
 fail:
    /* free the never-published copy; s->l1_table remains valid */
    qemu_free(new_l1_table);
    return -EIO;
}
566

    
567
/*
568
 * seek_l2_table
569
 *
570
 * seek l2_offset in the l2_cache table
571
 * if not found, return NULL,
572
 * if found,
573
 *   increments the l2 cache hit count of the entry,
574
 *   if counter overflow, divide by two all counters
575
 *   return the pointer to the l2 cache entry
576
 *
577
 */
578

    
579
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
580
{
581
    int i, j;
582

    
583
    for(i = 0; i < L2_CACHE_SIZE; i++) {
584
        if (l2_offset == s->l2_cache_offsets[i]) {
585
            /* increment the hit count */
586
            if (++s->l2_cache_counts[i] == 0xffffffff) {
587
                for(j = 0; j < L2_CACHE_SIZE; j++) {
588
                    s->l2_cache_counts[j] >>= 1;
589
                }
590
            }
591
            return s->l2_cache + (i << s->l2_bits);
592
        }
593
    }
594
    return NULL;
595
}
596

    
597
/*
598
 * l2_load
599
 *
600
 * Loads a L2 table into memory. If the table is in the cache, the cache
601
 * is used; otherwise the L2 table is loaded from the image file.
602
 *
603
 * Returns a pointer to the L2 table on success, or NULL if the read from
604
 * the image file failed.
605
 */
606

    
607
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
608
{
609
    BDRVQcowState *s = bs->opaque;
610
    int min_index;
611
    uint64_t *l2_table;
612

    
613
    /* seek if the table for the given offset is in the cache */
614

    
615
    l2_table = seek_l2_table(s, l2_offset);
616
    if (l2_table != NULL)
617
        return l2_table;
618

    
619
    /* not found: load a new entry in the least used one */
620

    
621
    min_index = l2_cache_new_entry(bs);
622
    l2_table = s->l2_cache + (min_index << s->l2_bits);
623
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
624
        s->l2_size * sizeof(uint64_t))
625
        return NULL;
626
    s->l2_cache_offsets[min_index] = l2_offset;
627
    s->l2_cache_counts[min_index] = 1;
628

    
629
    return l2_table;
630
}
631

    
632
/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 * Returns a pointer to the (cached) new L2 table, or NULL on I/O error.
 */
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
{
    BDRVQcowState *s = bs->opaque;
    int min_index;
    uint64_t old_l2_offset, tmp;
    uint64_t *l2_table, l2_offset;

    old_l2_offset = s->l1_table[l1_index];

    /* allocate a new l2 entry (one cluster worth of entries) */
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));

    /* update the L1 entry: mark the new table as privately owned */
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;

    /* NOTE(review): the L1 entry is persisted before the new L2 table
       contents are written below; a crash in between leaves the L1
       pointing at an uninitialized cluster -- confirm this ordering is
       intended. */
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
                    &tmp, sizeof(tmp)) != sizeof(tmp))
        return NULL;

    /* allocate a new entry in the l2 cache */
    min_index = l2_cache_new_entry(bs);
    l2_table = s->l2_cache + (min_index << s->l2_bits);

    if (old_l2_offset == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        /* if there was an old l2 table, read it from the disk
           (copy-on-write of the L2 table itself) */
        if (bdrv_pread(s->hd, old_l2_offset,
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return NULL;
    }
    /* write the l2 table to the file */
    if (bdrv_pwrite(s->hd, l2_offset,
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
        s->l2_size * sizeof(uint64_t))
        return NULL;

    /* update the l2 cache entry so subsequent lookups hit */
    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;

    return l2_table;
}
692

    
693
/* Number of clusters needed to hold 'size' bytes (rounded up). */
static int size_to_clusters(BDRVQcowState *s, int64_t size)
{
    return (size + s->cluster_size - 1) >> s->cluster_bits;
}
697

    
698
/* Starting at index 'start', count how many L2 entries (at most
 * nb_clusters) describe physically contiguous clusters, i.e. their
 * offsets (with 'mask' bits ignored) increase by cluster_size from the
 * offset in entry 0.  Returns 0 if entry 0 is unallocated. */
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t start, uint64_t mask)
{
    int i;
    uint64_t base = be64_to_cpu(l2_table[0]) & ~mask;

    if (!base)
        return 0;

    for (i = start; i < start + nb_clusters; i++) {
        if ((be64_to_cpu(l2_table[i]) & ~mask) != base + i * cluster_size)
            break;
    }

    return i - start;
}
713

    
714
/* Count the leading run of unallocated (zero) L2 entries, examining at
 * most nb_clusters entries. */
static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
{
    int count = 0;

    while (nb_clusters != 0 && l2_table[count] == 0) {
        nb_clusters--;
        count++;
    }

    return count;
}
723

    
724
/*
 * get_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * on entry, *num is the number of contiguous sectors we'd like to
 * access following offset.
 *
 * on exit, *num is the number of contiguous sectors we can read
 * without changing allocation state (all allocated contiguously, or
 * all unallocated).
 *
 * Returns the host cluster offset with QCOW_OFLAG_COPIED stripped
 * (QCOW_OFLAG_COMPRESSED may still be set for compressed clusters),
 * or 0 if the range is unallocated / the L2 table could not be read.
 */
static uint64_t get_cluster_offset(BlockDriverState *bs,
                                   uint64_t offset, int *num)
{
    BDRVQcowState *s = bs->opaque;
    int l1_index, l2_index;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int l1_bits, c;
    int index_in_cluster, nb_available, nb_needed, nb_clusters;

    /* sector position within the first cluster; *num is extended so
       that nb_needed counts from the cluster start */
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the offset and
     * the end of the l1 entry
     */
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));

    /* compute the number of available sectors */
    nb_available = (nb_available >> 9) + index_in_cluster;

    if (nb_needed > nb_available) {
        nb_needed = nb_available;
    }

    cluster_offset = 0;

    /* seek the l2 offset in the l1 table */
    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size)
        goto out;

    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */
    if (!l2_offset)
        goto out;

    /* load the l2 table in memory */
    l2_offset &= ~QCOW_OFLAG_COPIED;
    l2_table = l2_load(bs, l2_offset);
    if (l2_table == NULL)
        return 0;

    /* find the cluster offset for the given disk offset */
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    nb_clusters = size_to_clusters(s, nb_needed << 9);

    if (!cluster_offset) {
        /* how many empty clusters ? */
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
    } else {
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
    }

   nb_available = (c * s->cluster_sectors);
out:
    if (nb_available > nb_needed)
        nb_available = nb_needed;

    /* convert back from cluster-start-relative to caller-relative */
    *num = nb_available - index_in_cluster;

    return cluster_offset & ~QCOW_OFLAG_COPIED;
}
814

    
815
/*
816
 * free_any_clusters
817
 *
818
 * free clusters according to its type: compressed or not
819
 *
820
 */
821

    
822
static void free_any_clusters(BlockDriverState *bs,
823
                              uint64_t cluster_offset, int nb_clusters)
824
{
825
    BDRVQcowState *s = bs->opaque;
826

    
827
    /* free the cluster */
828

    
829
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
830
        int nb_csectors;
831
        nb_csectors = ((cluster_offset >> s->csize_shift) &
832
                       s->csize_mask) + 1;
833
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
834
                      nb_csectors * 512);
835
        return;
836
    }
837

    
838
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
839

    
840
    return;
841
}
842

    
843
/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the l2 table.
 *
 * the l2 table offset in the qcow2 file and the cluster index
 * in the l2 table are given to the caller.
 *
 * Returns 1 on success, 0 on failure (L1 growth, L2 load or L2
 * allocation failed).
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             uint64_t *new_l2_offset,
                             int *new_l2_index)
{
    BDRVQcowState *s = bs->opaque;
    int l1_index, l2_index, ret;
    uint64_t l2_offset, *l2_table;

    /* seek the l2 offset in the l1 table, growing the L1 table if the
       offset lies beyond its current coverage */
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = grow_l1_table(bs, l1_index + 1);
        if (ret < 0)
            return 0;
    }
    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */
    if (l2_offset & QCOW_OFLAG_COPIED) {
        /* this image owns the table exclusively: just load it */
        l2_offset &= ~QCOW_OFLAG_COPIED;
        l2_table = l2_load(bs, l2_offset);
        if (l2_table == NULL)
            return 0;
    } else {
        /* shared (refcount > 1) or absent table: release the old
           reference and allocate a private copy (copy-on-write) */
        if (l2_offset)
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
        l2_table = l2_allocate(bs, l1_index);
        if (l2_table == NULL)
            return 0;
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
    }

    /* find the cluster offset for the given disk offset */
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_offset = l2_offset;
    *new_l2_index = l2_index;

    return 1;
}
900

    
901
/*
902
 * alloc_compressed_cluster_offset
903
 *
904
 * For a given offset of the disk image, return cluster offset in
905
 * qcow2 file.
906
 *
907
 * If the offset is not found, allocate a new compressed cluster.
908
 *
909
 * Return the cluster offset if successful,
910
 * Return 0, otherwise.
911
 *
912
 */
913

    
914
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
                                                uint64_t offset,
                                                int compressed_size)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
    if (ret == 0)
        return 0;

    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    /* a COPIED (uncompressed, refcount 1) cluster is reused as-is */
    if (cluster_offset & QCOW_OFLAG_COPIED)
        return cluster_offset & ~QCOW_OFLAG_COPIED;

    /* replace whatever was mapped here before */
    if (cluster_offset)
        free_any_clusters(bs, cluster_offset, 1);

    /* byte-granular allocation; the entry encodes the sector count of
       the compressed data, not its byte size */
    cluster_offset = alloc_bytes(bs, compressed_size);
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    if (bdrv_pwrite(s->hd,
                    l2_offset + l2_index * sizeof(uint64_t),
                    l2_table + l2_index,
                    sizeof(uint64_t)) != sizeof(uint64_t))
        return 0;

    /* returns the flagged cluster descriptor, or 0 on failure */
    return cluster_offset;
}
954

    
955
/* Metadata describing a pending cluster allocation, filled by
   alloc_cluster_offset() and consumed by alloc_cluster_link_l2(). */
typedef struct QCowL2Meta
{
    uint64_t offset;      /* guest (virtual disk) offset of the request */
    int n_start;          /* first sector actually written in the first cluster */
    int nb_available;     /* sectors covered by the allocation, from cluster start */
    int nb_clusters;      /* newly allocated clusters; 0 = no L2 update needed */
} QCowL2Meta;
962

    
963
static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
964
        QCowL2Meta *m)
965
{
966
    BDRVQcowState *s = bs->opaque;
967
    int i, j = 0, l2_index, ret;
968
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
969

    
970
    if (m->nb_clusters == 0)
971
        return 0;
972

    
973
    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
974

    
975
    /* copy content of unmodified sectors */
976
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
977
    if (m->n_start) {
978
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
979
        if (ret < 0)
980
            goto err;
981
    }
982

    
983
    if (m->nb_available & (s->cluster_sectors - 1)) {
984
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
985
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
986
                m->nb_available - end, s->cluster_sectors);
987
        if (ret < 0)
988
            goto err;
989
    }
990

    
991
    ret = -EIO;
992
    /* update L2 table */
993
    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
994
        goto err;
995

    
996
    for (i = 0; i < m->nb_clusters; i++) {
997
        if(l2_table[l2_index + i] != 0)
998
            old_cluster[j++] = l2_table[l2_index + i];
999

    
1000
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
1001
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
1002
     }
1003

    
1004
    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
1005
                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
1006
            m->nb_clusters * sizeof(uint64_t))
1007
        goto err;
1008

    
1009
    for (i = 0; i < j; i++)
1010
        free_any_clusters(bs, old_cluster[i], 1);
1011

    
1012
    ret = 0;
1013
err:
1014
    qemu_free(old_cluster);
1015
    return ret;
1016
 }
1017

    
1018
/*
1019
 * alloc_cluster_offset
1020
 *
1021
 * For a given offset of the disk image, return cluster offset in
1022
 * qcow2 file.
1023
 *
1024
 * If the offset is not found, allocate a new cluster.
1025
 *
1026
 * Return the cluster offset if successful,
1027
 * Return 0, otherwise.
1028
 *
1029
 */
1030

    
1031
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
                                     uint64_t offset,
                                     int n_start, int n_end,
                                     int *num, QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t l2_offset, *l2_table, cluster_offset;
    int nb_clusters, i = 0;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
    if (ret == 0)
        return 0;

    /* clusters needed to cover n_end sectors, capped at the end of the
       current L2 table */
    nb_clusters = size_to_clusters(s, n_end << 9);

    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* We keep all QCOW_OFLAG_COPIED clusters */

    if (cluster_offset & QCOW_OFLAG_COPIED) {
        /* already writable in place: no allocation, no L2 update needed */
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, 0);

        cluster_offset &= ~QCOW_OFLAG_COPIED;
        m->nb_clusters = 0;

        goto out;
    }

    /* for the moment, multiple compressed clusters are not managed */

    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
        nb_clusters = 1;

    /* how many available clusters ? */

    while (i < nb_clusters) {
        /* alternate between runs of identical allocated entries and
           runs of free entries; stop at an entry we must preserve */
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
                &l2_table[l2_index], i, 0);

        if(be64_to_cpu(l2_table[l2_index + i]))
            break;

        i += count_contiguous_free_clusters(nb_clusters - i,
                &l2_table[l2_index + i]);

        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);

        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
            break;
    }
    nb_clusters = i;

    /* allocate a new cluster */

    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);

    /* save info needed for meta data update */
    m->offset = offset;
    m->n_start = n_start;
    m->nb_clusters = nb_clusters;

out:
    /* sectors usable in the covered region (counted from cluster start) */
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);

    /* sectors the caller may actually write starting at n_start */
    *num = m->nb_available - n_start;

    return cluster_offset;
}
1104

    
1105
/* Report whether sector_num is backed by this image.  *pnum is set to
   the number of contiguous sectors sharing that allocation state
   (at most nb_sectors). */
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    uint64_t host_offset;

    *pnum = nb_sectors;
    host_offset = get_cluster_offset(bs, sector_num << 9, pnum);
    return host_offset != 0;
}
1115

    
1116
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1117
                             const uint8_t *buf, int buf_size)
1118
{
1119
    z_stream strm1, *strm = &strm1;
1120
    int ret, out_len;
1121

    
1122
    memset(strm, 0, sizeof(*strm));
1123

    
1124
    strm->next_in = (uint8_t *)buf;
1125
    strm->avail_in = buf_size;
1126
    strm->next_out = out_buf;
1127
    strm->avail_out = out_buf_size;
1128

    
1129
    ret = inflateInit2(strm, -12);
1130
    if (ret != Z_OK)
1131
        return -1;
1132
    ret = inflate(strm, Z_FINISH);
1133
    out_len = strm->next_out - out_buf;
1134
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1135
        out_len != out_buf_size) {
1136
        inflateEnd(strm);
1137
        return -1;
1138
    }
1139
    inflateEnd(strm);
1140
    return 0;
1141
}
1142

    
1143
/* Decompress the cluster described by cluster_offset into the one-entry
   cluster cache (s->cluster_cache).  A hit on the cached cluster is a
   no-op.  Returns 0 on success, -1 on read/decompress failure. */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
{
    uint64_t coffset = cluster_offset & s->cluster_offset_mask;
    int nb_csectors, sector_offset, csize;

    if (s->cluster_cache_offset == coffset)
        return 0;   /* already decompressed */

    /* sector count of the compressed data is encoded in the entry */
    nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
    sector_offset = coffset & 511;
    csize = nb_csectors * 512 - sector_offset;

    if (bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors) < 0)
        return -1;
    if (decompress_buffer(s->cluster_cache, s->cluster_size,
                          s->cluster_data + sector_offset, csize) < 0)
        return -1;

    s->cluster_cache_offset = coffset;
    return 0;
}
1165

    
1166
/* handle reading after the end of the backing file */
1167
/* Handle reading past the end of the backing file: return how many of
   the requested sectors actually exist in the backing file and
   zero-fill the part of buf beyond them. */
static int backing_read1(BlockDriverState *bs,
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
{
    int valid;

    if (sector_num + nb_sectors <= bs->total_sectors)
        return nb_sectors;   /* request fully inside the backing file */

    valid = (sector_num < bs->total_sectors)
          ? (int)(bs->total_sectors - sector_num)
          : 0;
    memset(buf + valid * 512, 0, 512 * (nb_sectors - valid));
    return valid;
}
1180

    
1181
/* Synchronous read: walk the request cluster by cluster, sourcing each
   piece from the backing file, a zero fill, the compressed-cluster
   cache, or the image file (with decryption).  Returns 0 or -1. */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n, n1;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        n = nb_sectors;
        /* n is clipped to the sectors sharing this mapping */
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
                if (n1 > 0) {
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
                    if (ret < 0)
                        return -1;
                }
            } else {
                /* unallocated, no backing file: reads as zeros */
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                /* decrypt in place (last arg 0 = decrypt) */
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
1223

    
1224
/* Synchronous write: allocate clusters as needed, write (optionally
   encrypted) data, then link the new clusters into the L2 table.
   On failure the freshly allocated clusters are released.
   Returns 0 or -1. */
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
                     const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n;
    uint64_t cluster_offset;
    int n_end;
    QCowL2Meta l2meta;

    while (nb_sectors > 0) {
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n_end = index_in_cluster + nb_sectors;
        /* the encryption bounce buffer holds at most
           QCOW_MAX_CRYPT_CLUSTERS clusters per iteration */
        if (s->crypt_method &&
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
                                              index_in_cluster,
                                              n_end, &n, &l2meta);
        if (!cluster_offset)
            return -1;
        if (s->crypt_method) {
            /* encrypt into s->cluster_data (last arg 1 = encrypt) */
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
                            &s->aes_encrypt_key);
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
                              s->cluster_data, n * 512);
        } else {
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
        }
        /* data must be on disk before the L2 entries point at it */
        if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
            return -1;
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    s->cluster_cache_offset = -1; /* disable compressed cache */
    return 0;
}
1263

    
1264
/* State carried across the callbacks of one asynchronous read or write
   request.  The request is processed piecewise; each completed
   sub-request re-enters qcow_aio_read_cb / qcow_aio_write_cb. */
typedef struct QCowAIOCB {
    BlockDriverAIOCB common;        /* generic AIOCB header (must be first) */
    int64_t sector_num;             /* next guest sector to process */
    QEMUIOVector *qiov;             /* caller's scatter/gather list */
    uint8_t *buf;                   /* current position in the linear buffer */
    void *orig_buf;                 /* bounce buffer when qiov->niov > 1 */
    int nb_sectors;                 /* sectors still to process */
    int n;                          /* sectors handled by the current step */
    uint64_t cluster_offset;        /* mapping returned for the current step */
    uint8_t *cluster_data;          /* scratch buffer for encrypted writes */
    BlockDriverAIOCB *hd_aiocb;     /* in-flight request on the lower layer */
    struct iovec hd_iov;            /* single-segment iovec for that request */
    QEMUIOVector hd_qiov;           /* qiov wrapping hd_iov */
    QEMUBH *bh;                     /* bottom half used to restart the FSM */
    QCowL2Meta l2meta;              /* pending L2 update for writes */
} QCowAIOCB;
1280

    
1281
static void qcow_aio_read_cb(void *opaque, int ret);
1282
static void qcow_aio_read_bh(void *opaque)
1283
{
1284
    QCowAIOCB *acb = opaque;
1285
    qemu_bh_delete(acb->bh);
1286
    acb->bh = NULL;
1287
    qcow_aio_read_cb(opaque, 0);
1288
}
1289

    
1290
/* Schedule cb as a bottom half for acb.  Fails with -EIO if one is
   already pending or cannot be created; returns 0 on success. */
static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
{
    if (acb->bh != NULL)
        return -EIO;   /* a bottom half is already pending */

    acb->bh = qemu_bh_new(cb, acb);
    if (acb->bh == NULL)
        return -EIO;

    qemu_bh_schedule(acb->bh);
    return 0;
}
1303

    
1304
/* Completion callback and main loop of the asynchronous read state
   machine.  Each invocation finishes post-processing of the previous
   step (decryption), then issues the next sub-request: lower-layer AIO,
   backing-file AIO, or a bottom half for synchronously satisfied parts
   (zero fill, compressed cache).  On completion or error, delivers the
   result to the caller and releases the AIOCB. */
static void qcow_aio_read_cb(void *opaque, int ret)
{
    QCowAIOCB *acb = opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n1;

    acb->hd_aiocb = NULL;
    if (ret < 0)
        goto done;

    /* post process the read buffer */
    if (!acb->cluster_offset) {
        /* nothing to do */
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* nothing to do */
    } else {
        if (s->crypt_method) {
            /* decrypt the sectors just read, in place */
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
                            acb->n, 0,
                            &s->aes_decrypt_key);
        }
    }

    acb->nb_sectors -= acb->n;
    acb->sector_num += acb->n;
    acb->buf += acb->n * 512;

    if (acb->nb_sectors == 0) {
        /* request completed */
        ret = 0;
        goto done;
    }

    /* prepare next AIO request */
    acb->n = acb->nb_sectors;
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);

    if (!acb->cluster_offset) {
        if (bs->backing_hd) {
            /* read from the base image */
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
                               acb->buf, acb->n);
            if (n1 > 0) {
                acb->hd_iov.iov_base = (void *)acb->buf;
                acb->hd_iov.iov_len = acb->n * 512;
                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
                                    &acb->hd_qiov, acb->n,
                                    qcow_aio_read_cb, acb);
                if (acb->hd_aiocb == NULL)
                    goto done;
            } else {
                /* entirely past the backing file: buf already zeroed */
                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
                if (ret < 0)
                    goto done;
            }
        } else {
            /* Note: in this case, no need to wait */
            memset(acb->buf, 0, 512 * acb->n);
            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
            if (ret < 0)
                goto done;
        }
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
        /* add AIO support for compressed blocks ? */
        if (decompress_cluster(s, acb->cluster_offset) < 0)
            goto done;
        memcpy(acb->buf,
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
        if (ret < 0)
            goto done;
    } else {
        /* a data cluster offset must be sector aligned */
        if ((acb->cluster_offset & 511) != 0) {
            ret = -EIO;
            goto done;
        }

        acb->hd_iov.iov_base = (void *)acb->buf;
        acb->hd_iov.iov_len = acb->n * 512;
        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
        acb->hd_aiocb = bdrv_aio_readv(s->hd,
                            (acb->cluster_offset >> 9) + index_in_cluster,
                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
        if (acb->hd_aiocb == NULL)
            goto done;
    }

    return;
done:
    /* scatter the bounce buffer back into the caller's iovec */
    if (acb->qiov->niov > 1) {
        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
        qemu_vfree(acb->orig_buf);
    }
    acb->common.cb(acb->common.opaque, ret);
    qemu_aio_release(acb);
}
1403

    
1404
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1405
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1406
        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
1407
{
1408
    QCowAIOCB *acb;
1409

    
1410
    acb = qemu_aio_get(bs, cb, opaque);
1411
    if (!acb)
1412
        return NULL;
1413
    acb->hd_aiocb = NULL;
1414
    acb->sector_num = sector_num;
1415
    acb->qiov = qiov;
1416
    if (qiov->niov > 1) {
1417
        acb->buf = acb->orig_buf = qemu_memalign(512, qiov->size);
1418
        if (is_write)
1419
            qemu_iovec_to_buffer(qiov, acb->buf);
1420
    } else {
1421
        acb->buf = (uint8_t *)qiov->iov->iov_base;
1422
    }
1423
    acb->nb_sectors = nb_sectors;
1424
    acb->n = 0;
1425
    acb->cluster_offset = 0;
1426
    acb->l2meta.nb_clusters = 0;
1427
    return acb;
1428
}
1429

    
1430
static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
1431
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1432
        BlockDriverCompletionFunc *cb, void *opaque)
1433
{
1434
    QCowAIOCB *acb;
1435

    
1436
    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
1437
    if (!acb)
1438
        return NULL;
1439

    
1440
    qcow_aio_read_cb(acb, 0);
1441
    return &acb->common;
1442
}
1443

    
1444
/* Completion callback and main loop of the asynchronous write state
   machine.  Each invocation first links the clusters of the previous
   step into the L2 table, then allocates clusters for the next piece
   and issues the lower-layer write (encrypting into cluster_data if
   needed).  On completion or error, delivers the result and releases
   the AIOCB. */
static void qcow_aio_write_cb(void *opaque, int ret)
{
    QCowAIOCB *acb = opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    const uint8_t *src_buf;
    int n_end;

    acb->hd_aiocb = NULL;

    if (ret < 0)
        goto done;

    /* the data of the previous step is on disk: publish it in the L2
       table, or roll the allocation back on failure */
    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
        goto done;
    }

    acb->nb_sectors -= acb->n;
    acb->sector_num += acb->n;
    acb->buf += acb->n * 512;

    if (acb->nb_sectors == 0) {
        /* request completed */
        ret = 0;
        goto done;
    }

    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
    n_end = index_in_cluster + acb->nb_sectors;
    /* the encryption bounce buffer holds at most
       QCOW_MAX_CRYPT_CLUSTERS clusters per step */
    if (s->crypt_method &&
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;

    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
                                          index_in_cluster,
                                          n_end, &acb->n, &acb->l2meta);
    /* allocation failed, or the offset is not sector aligned */
    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
        ret = -EIO;
        goto done;
    }
    if (s->crypt_method) {
        /* lazily allocate the encryption bounce buffer */
        if (!acb->cluster_data) {
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
                                             s->cluster_size);
        }
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
                        acb->n, 1, &s->aes_encrypt_key);
        src_buf = acb->cluster_data;
    } else {
        src_buf = acb->buf;
    }
    acb->hd_iov.iov_base = (void *)src_buf;
    acb->hd_iov.iov_len = acb->n * 512;
    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
    acb->hd_aiocb = bdrv_aio_writev(s->hd,
                                    (acb->cluster_offset >> 9) + index_in_cluster,
                                    &acb->hd_qiov, acb->n,
                                    qcow_aio_write_cb, acb);
    if (acb->hd_aiocb == NULL)
        goto done;

    return;

done:
    if (acb->qiov->niov > 1)
        qemu_vfree(acb->orig_buf);
    acb->common.cb(acb->common.opaque, ret);
    qemu_aio_release(acb);
}
1515

    
1516
static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
1517
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1518
        BlockDriverCompletionFunc *cb, void *opaque)
1519
{
1520
    BDRVQcowState *s = bs->opaque;
1521
    QCowAIOCB *acb;
1522

    
1523
    s->cluster_cache_offset = -1; /* disable compressed cache */
1524

    
1525
    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
1526
    if (!acb)
1527
        return NULL;
1528

    
1529
    qcow_aio_write_cb(acb, 0);
1530
    return &acb->common;
1531
}
1532

    
1533
/* Cancel an outstanding request: abort any in-flight lower-layer AIO
   and release the AIOCB. */
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;

    if (acb->hd_aiocb != NULL)
        bdrv_aio_cancel(acb->hd_aiocb);
    qemu_aio_release(acb);
}
1540

    
1541
/* Release all per-image state: in-memory metadata caches, the refcount
   machinery, and finally the underlying file handle. */
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    qemu_free(s->l1_table);
    qemu_free(s->l2_cache);
    qemu_free(s->cluster_cache);
    qemu_free(s->cluster_data);
    refcount_close(bs);
    bdrv_delete(s->hd);
}
1551

    
1552
/* XXX: use std qcow open function ? */
1553
/* Transient layout state used while creating a new image
   (see qcow_create2). */
typedef struct QCowCreateState {
    int cluster_size;               /* bytes per cluster */
    int cluster_bits;               /* log2 of cluster_size */
    uint16_t *refcount_block;       /* in-memory refcount blocks (big-endian) */
    uint64_t *refcount_table;       /* in-memory refcount table (big-endian) */
    int64_t l1_table_offset;        /* file offset of the (zeroed) L1 table */
    int64_t refcount_table_offset;  /* file offset of the refcount table */
    int64_t refcount_block_offset;  /* file offset of the first refcount block */
} QCowCreateState;
1562

    
1563
/* Increment the in-memory refcount of every cluster overlapped by the
   byte range [offset, offset + size).  Entries are kept big-endian, as
   they will be written to disk. */
static void create_refcount_update(QCowCreateState *s,
                                   int64_t offset, int64_t size)
{
    int64_t first, last, c;

    first = offset & ~(s->cluster_size - 1);
    last = (offset + size - 1) & ~(s->cluster_size - 1);
    for (c = first; c <= last; c += s->cluster_size) {
        uint16_t *entry = &s->refcount_block[c >> s->cluster_bits];
        *entry = cpu_to_be16(be16_to_cpu(*entry) + 1);
    }
}
1580

    
1581
/* Create a new qcow2 image file of total_size sectors.  Lays out, in
   order: header (+ optional backing-format extension and backing file
   name), zeroed L1 table, refcount table, refcount blocks — and writes
   refcounts covering all of that metadata.
   Returns 0 on success, -1 if the file cannot be created.
   NOTE(review): the write()/lseek() return values below are unchecked,
   so a short write or full disk produces a truncated image without an
   error — worth hardening. */
static int qcow_create2(const char *filename, int64_t total_size,
                        const char *backing_file, const char *backing_format,
                        int flags)
{

    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
    int ref_clusters, backing_format_len = 0;
    QCowHeader header;
    uint64_t tmp, offset;
    QCowCreateState s1, *s = &s1;
    QCowExtension ext_bf = {0, 0};


    memset(s, 0, sizeof(*s));

    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
    if (fd < 0)
        return -1;
    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
    header.size = cpu_to_be64(total_size * 512);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
        if (backing_format) {
            /* header extension recording the backing file's format,
               padded to 8 bytes */
            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
            backing_format_len = strlen(backing_format);
            ext_bf.len = (backing_format_len + 7) & ~7;
            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
        }
        header.backing_file_offset = cpu_to_be64(header_size);
        backing_filename_len = strlen(backing_file);
        header.backing_file_size = cpu_to_be32(backing_filename_len);
        header_size += backing_filename_len;
    }
    s->cluster_bits = 12;  /* 4 KB clusters */
    s->cluster_size = 1 << s->cluster_bits;
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
    header_size = (header_size + 7) & ~7;
    if (flags & BLOCK_FLAG_ENCRYPT) {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    } else {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    }
    /* one L1 entry covers 2^(cluster_bits + l2_bits) guest bytes */
    l2_bits = s->cluster_bits - 3;
    shift = s->cluster_bits + l2_bits;
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
    offset = align_offset(header_size, s->cluster_size);
    s->l1_table_offset = offset;
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
    header.l1_size = cpu_to_be32(l1_size);
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);

    s->refcount_table = qemu_mallocz(s->cluster_size);

    s->refcount_table_offset = offset;
    header.refcount_table_offset = cpu_to_be64(offset);
    header.refcount_table_clusters = cpu_to_be32(1);
    offset += s->cluster_size;
    s->refcount_block_offset = offset;

    /* count how many refcount blocks needed */
    tmp = offset >> s->cluster_bits;
    ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1;
    for (i=0; i < ref_clusters; i++) {
        s->refcount_table[i] = cpu_to_be64(offset);
        offset += s->cluster_size;
    }

    s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size);

    /* update refcounts */
    create_refcount_update(s, 0, header_size);
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
    create_refcount_update(s, s->refcount_block_offset, ref_clusters * s->cluster_size);

    /* write all the data */
    write(fd, &header, sizeof(header));
    if (backing_file) {
        if (backing_format_len) {
            char zero[16];
            int d = ext_bf.len - backing_format_len;

            memset(zero, 0, sizeof(zero));
            cpu_to_be32s(&ext_bf.magic);
            cpu_to_be32s(&ext_bf.len);
            write(fd, &ext_bf, sizeof(ext_bf));
            write(fd, backing_format, backing_format_len);
            if (d>0) {
                /* pad the extension payload to its 8-byte aligned length */
                write(fd, zero, d);
            }
        }
        write(fd, backing_file, backing_filename_len);
    }
    lseek(fd, s->l1_table_offset, SEEK_SET);
    tmp = 0;
    for(i = 0;i < l1_size; i++) {
        write(fd, &tmp, sizeof(tmp));
    }
    lseek(fd, s->refcount_table_offset, SEEK_SET);
    write(fd, s->refcount_table, s->cluster_size);

    lseek(fd, s->refcount_block_offset, SEEK_SET);
    write(fd, s->refcount_block, ref_clusters * s->cluster_size);

    qemu_free(s->refcount_table);
    qemu_free(s->refcount_block);
    close(fd);
    return 0;
}
1693

    
1694
/* Public create entry point: delegates to qcow_create2 with no explicit
   backing format. */
static int qcow_create(const char *filename, int64_t total_size,
                       const char *backing_file, int flags)
{
    return qcow_create2(filename, total_size, backing_file, NULL, flags);
}
1699

    
1700
/* Reset the image to an empty state.  The real implementation is
   disabled below (marked "not correct" — it does not account for qcow2
   refcounts); currently a no-op that reports success. */
static int qcow_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1719

    
1720
/* XXX: put compressed sectors first, then all the cluster aligned
1721
   tables to avoid losing bytes in alignment */
1722
/*
 * Write one cluster of data in compressed form at sector_num.
 * nb_sectors == 0 is a final flush request that pads the file end to a
 * sector boundary.  If the data does not compress (deflated size >=
 * cluster size) the cluster is written uncompressed via qcow_write().
 *
 * Returns 0 on success, -EINVAL for a partial cluster, -1 on error.
 *
 * XXX: put compressed sectors first, then all the cluster aligned
 * tables to avoid losing bytes in alignment
 */
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(s->hd);
        cluster_offset = (cluster_offset + 511) & ~511;
        bdrv_truncate(s->hd, cluster_offset);
        return 0;
    }

    /* only whole clusters can be compressed */
    if (nb_sectors != s->cluster_sectors)
        return -EINVAL;

    /* worst-case deflate expansion bound */
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
        qemu_free(out_buf);
        return -1;
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        qemu_free(out_buf);
        deflateEnd(&strm);
        return -1;
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
    } else {
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
                                              out_len);
        if (!cluster_offset) {
            /* fix: free the compression buffer on this error path too
               (it was previously leaked) */
            qemu_free(out_buf);
            return -1;
        }
        cluster_offset &= s->cluster_offset_mask;
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
            qemu_free(out_buf);
            return -1;
        }
    }

    qemu_free(out_buf);
    return 0;
}
1788

    
1789
/* Flush all pending writes of the qcow2 image to the underlying file. */
static void qcow_flush(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    bdrv_flush(s->hd);
}
1794

    
1795
/* Fill in driver-specific information: the cluster size and the file
   offset at which saved VM state is stored (derived from the reserved
   L1 index s->l1_vm_state_index).  Always succeeds. */
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;

    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset =
        (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
    return 0;
}
1803

    
1804
/*********************************************************/
1805
/* snapshot support */
1806

    
1807
/* update the refcounts of snapshots and the copied flag */
/*
 * Walk the two-level table rooted at l1_table_offset (l1_size entries)
 * and apply 'addend' (+1, -1 or 0) to the refcount of every data
 * cluster and every L2 table cluster it references.  While walking,
 * the QCOW_OFLAG_COPIED flag of each L1/L2 entry is recomputed: the
 * flag is set iff the target cluster's refcount is exactly 1 (only the
 * active image references it, so in-place writes are allowed).  With
 * addend == 0 only the COPIED flags are refreshed.
 *
 * If l1_table_offset is the active L1 table, the in-memory copy
 * s->l1_table is used directly; otherwise the table is read
 * (big-endian) from the file.  Modified L1/L2 tables are written back.
 *
 * Returns 0 on success, -EIO on any read/write error.
 */
static int update_snapshot_refcount(BlockDriverState *bs,
                                    int64_t l1_table_offset,
                                    int l1_size,
                                    int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
    int64_t old_offset, old_l2_offset;
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;

    /* L2 tables may be rewritten below, so the L2 cache must not be
       trusted afterwards */
    l2_cache_reset(bs);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    l1_allocated = 0;
    if (l1_table_offset != s->l1_table_offset) {
        /* inactive (snapshot) L1 table: load from file and byteswap */
        l1_table = qemu_malloc(l1_size2);
        l1_allocated = 1;
        if (bdrv_pread(s->hd, l1_table_offset,
                       l1_table, l1_size2) != l1_size2)
            goto fail;
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        /* active L1 table: operate on the in-memory copy */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = 0;
    }

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    l1_modified = 0;
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= ~QCOW_OFLAG_COPIED;
            l2_modified = 0;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    old_offset = offset;
                    offset &= ~QCOW_OFLAG_COPIED;
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* compressed cluster: the refcount is kept per
                           512-byte sector spanned by the compressed data */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0)
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
                                            nb_csectors * 512, addend);
                        /* compressed clusters are never modified */
                        refcount = 2;
                    } else {
                        if (addend != 0) {
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
                        } else {
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
                        }
                    }

                    /* refcount 1 => sole owner => writable in place */
                    if (refcount == 1) {
                        offset |= QCOW_OFLAG_COPIED;
                    }
                    if (offset != old_offset) {
                        l2_table[j] = cpu_to_be64(offset);
                        l2_modified = 1;
                    }
                }
            }
            if (l2_modified) {
                if (bdrv_pwrite(s->hd,
                                l2_offset, l2_table, l2_size) != l2_size)
                    goto fail;
            }

            /* now adjust/query the refcount of the L2 table cluster itself */
            if (addend != 0) {
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }
    if (l1_modified) {
        /* write back big-endian, then restore the host-endian copy */
        for(i = 0; i < l1_size; i++)
            cpu_to_be64s(&l1_table[i]);
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
                        l1_size2) != l1_size2)
            goto fail;
        for(i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
1918

    
1919
/* Release the in-memory snapshot table: each snapshot's name and id
   strings, then the array itself.  Leaves s->snapshots empty. */
static void qcow_free_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int i;

    for(i = 0; i < s->nb_snapshots; i++) {
        QCowSnapshot *sn = &s->snapshots[i];

        qemu_free(sn->name);
        qemu_free(sn->id_str);
    }
    qemu_free(s->snapshots);
    s->snapshots = NULL;
    s->nb_snapshots = 0;
}
1932

    
1933
/*
 * Read the snapshot table from the image into s->snapshots.
 *
 * The on-disk layout at s->snapshots_offset is s->nb_snapshots records,
 * each 8-byte aligned: a big-endian QCowSnapshotHeader, then
 * extra_data_size opaque bytes (skipped), then the id string and the
 * name string (neither NUL-terminated on disk).
 *
 * On success s->snapshots_size is set to the total on-disk size of the
 * table and 0 is returned; on any read error all partially loaded
 * snapshots are freed and -1 is returned.
 */
static int qcow_read_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshotHeader h;
    QCowSnapshot *sn;
    int i, id_str_size, name_size;
    int64_t offset;
    uint32_t extra_data_size;

    /* image without snapshots: nothing to load */
    if (!s->nb_snapshots) {
        s->snapshots = NULL;
        s->snapshots_size = 0;
        return 0;
    }

    offset = s->snapshots_offset;
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
    for(i = 0; i < s->nb_snapshots; i++) {
        /* each snapshot header starts on an 8-byte boundary */
        offset = align_offset(offset, 8);
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        sn = s->snapshots + i;
        /* all on-disk fields are big-endian */
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
        sn->l1_size = be32_to_cpu(h.l1_size);
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
        sn->date_sec = be32_to_cpu(h.date_sec);
        sn->date_nsec = be32_to_cpu(h.date_nsec);
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
        extra_data_size = be32_to_cpu(h.extra_data_size);

        id_str_size = be16_to_cpu(h.id_str_size);
        name_size = be16_to_cpu(h.name_size);

        /* skip extra data we do not understand */
        offset += extra_data_size;

        /* id string is not NUL-terminated on disk; terminate it here */
        sn->id_str = qemu_malloc(id_str_size + 1);
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        sn->id_str[id_str_size] = '\0';

        sn->name = qemu_malloc(name_size + 1);
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
        sn->name[name_size] = '\0';
    }
    s->snapshots_size = offset - s->snapshots_offset;
    return 0;
 fail:
    qcow_free_snapshots(bs);
    return -1;
}
1987

    
1988
/* add at the end of the file a new list of snapshots */
/*
 * Serialize s->snapshots into freshly allocated clusters, update the
 * header fields (snapshots_offset, nb_snapshots) to point at the new
 * table, and only then free the clusters of the old table.  Layout per
 * snapshot: 8-byte aligned big-endian header, then id string, then
 * name string (no NUL terminators on disk).
 *
 * Returns 0 on success, -1 on any write error.  NOTE(review): on
 * failure the clusters allocated for the new table are not released —
 * they leak inside the image; verify against later fixes upstream.
 */
static int qcow_write_snapshots(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    QCowSnapshotHeader h;
    int i, name_size, id_str_size, snapshots_size;
    uint64_t data64;
    uint32_t data32;
    int64_t offset, snapshots_offset;

    /* compute the size of the snapshots */
    offset = 0;
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        offset = align_offset(offset, 8);
        offset += sizeof(h);
        offset += strlen(sn->id_str);
        offset += strlen(sn->name);
    }
    snapshots_size = offset;

    /* allocate space for the new table before touching the old one */
    snapshots_offset = alloc_clusters(bs, snapshots_size);
    offset = snapshots_offset;

    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        memset(&h, 0, sizeof(h));
        /* header fields are stored big-endian */
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
        h.l1_size = cpu_to_be32(sn->l1_size);
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
        h.date_sec = cpu_to_be32(sn->date_sec);
        h.date_nsec = cpu_to_be32(sn->date_nsec);
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);

        id_str_size = strlen(sn->id_str);
        name_size = strlen(sn->name);
        h.id_str_size = cpu_to_be16(id_str_size);
        h.name_size = cpu_to_be16(name_size);
        offset = align_offset(offset, 8);
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
            goto fail;
        offset += sizeof(h);
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
            goto fail;
        offset += id_str_size;
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
            goto fail;
        offset += name_size;
    }

    /* update the various header fields */
    data64 = cpu_to_be64(snapshots_offset);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
                    &data64, sizeof(data64)) != sizeof(data64))
        goto fail;
    data32 = cpu_to_be32(s->nb_snapshots);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
                    &data32, sizeof(data32)) != sizeof(data32))
        goto fail;

    /* free the old snapshot table */
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
    s->snapshots_offset = snapshots_offset;
    s->snapshots_size = snapshots_size;
    return 0;
 fail:
    return -1;
}
2057

    
2058
/* Generate a fresh numeric snapshot id: one greater than the largest
   id currently in use (ids are parsed as base-10 integers; non-numeric
   ids parse as 0).  The result is written into id_str. */
static void find_new_snapshot_id(BlockDriverState *bs,
                                 char *id_str, int id_str_size)
{
    BDRVQcowState *s = bs->opaque;
    int id_max = 0;
    int i;

    for(i = 0; i < s->nb_snapshots; i++) {
        int id = strtoul(s->snapshots[i].id_str, NULL, 10);

        if (id > id_max)
            id_max = id;
    }
    snprintf(id_str, id_str_size, "%d", id_max + 1);
}
2073

    
2074
/* Return the index of the snapshot whose id matches id_str exactly,
   or -1 if no snapshot has that id. */
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
{
    BDRVQcowState *s = bs->opaque;
    int i;

    for(i = 0; i < s->nb_snapshots; i++) {
        if (strcmp(s->snapshots[i].id_str, id_str) == 0)
            return i;
    }
    return -1;
}
2085

    
2086
/* Look up a snapshot first by id, then by name; ids take precedence.
   Returns the snapshot index or -1 when nothing matches. */
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
{
    BDRVQcowState *s = bs->opaque;
    int i;
    int idx = find_snapshot_by_id(bs, name);

    if (idx >= 0)
        return idx;
    for(i = 0; i < s->nb_snapshots; i++) {
        if (strcmp(s->snapshots[i].name, name) == 0)
            return i;
    }
    return -1;
}
2100

    
2101
/* if no id is provided, a new one is constructed */
2102
static int qcow_snapshot_create(BlockDriverState *bs,
2103
                                QEMUSnapshotInfo *sn_info)
2104
{
2105
    BDRVQcowState *s = bs->opaque;
2106
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
2107
    int i, ret;
2108
    uint64_t *l1_table = NULL;
2109

    
2110
    memset(sn, 0, sizeof(*sn));
2111

    
2112
    if (sn_info->id_str[0] == '\0') {
2113
        /* compute a new id */
2114
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
2115
    }
2116

    
2117
    /* check that the ID is unique */
2118
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
2119
        return -ENOENT;
2120

    
2121
    sn->id_str = qemu_strdup(sn_info->id_str);
2122
    if (!sn->id_str)
2123
        goto fail;
2124
    sn->name = qemu_strdup(sn_info->name);
2125
    if (!sn->name)
2126
        goto fail;
2127
    sn->vm_state_size = sn_info->vm_state_size;
2128
    sn->date_sec = sn_info->date_sec;
2129
    sn->date_nsec = sn_info->date_nsec;
2130
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
2131

    
2132
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
2133
    if (ret < 0)
2134
        goto fail;
2135

    
2136
    /* create the L1 table of the snapshot */
2137
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
2138
    sn->l1_size = s->l1_size;
2139

    
2140
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
2141
    for(i = 0; i < s->l1_size; i++) {
2142
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
2143
    }
2144
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
2145
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
2146
        (s->l1_size * sizeof(uint64_t)))
2147
        goto fail;
2148
    qemu_free(l1_table);
2149
    l1_table = NULL;
2150

    
2151
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
2152
    if (s->snapshots) {
2153
        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
2154
        qemu_free(s->snapshots);
2155
    }
2156
    s->snapshots = snapshots1;
2157
    s->snapshots[s->nb_snapshots++] = *sn;
2158

    
2159
    if (qcow_write_snapshots(bs) < 0)
2160
        goto fail;
2161
#ifdef DEBUG_ALLOC
2162
    check_refcounts(bs);
2163
#endif
2164
    return 0;
2165
 fail:
2166
    qemu_free(sn->name);
2167
    qemu_free(l1_table);
2168
    return -1;
2169
}
2170

    
2171
/* copy the snapshot 'snapshot_name' into the current disk image */
/*
 * Revert the active image state to the snapshot identified by
 * snapshot_id (matched by id first, then by name).  The current
 * state's cluster references are dropped, the snapshot's L1 table is
 * copied over the active one (both on disk and in memory), and the
 * restored state takes references on the snapshot's clusters.
 *
 * Returns 0 on success, -ENOENT if the snapshot does not exist,
 * -EIO on any I/O error.
 */
static int qcow_snapshot_goto(BlockDriverState *bs,
                              const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int i, snapshot_index, l1_size2;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    /* release the clusters referenced by the current state */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
        goto fail;

    /* make sure the active L1 table can hold the snapshot's entries */
    if (grow_l1_table(bs, sn->l1_size) < 0)
        goto fail;

    s->l1_size = sn->l1_size;
    l1_size2 = s->l1_size * sizeof(uint64_t);
    /* copy the snapshot l1 table to the current l1 table */
    if (bdrv_pread(s->hd, sn->l1_table_offset,
                   s->l1_table, l1_size2) != l1_size2)
        goto fail;
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
                    s->l1_table, l1_size2) != l1_size2)
        goto fail;
    /* the table was read big-endian; convert the in-memory copy */
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }

    /* the restored state now references the snapshot's clusters */
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
        goto fail;

#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
 fail:
    return -EIO;
}
2213

    
2214
/*
 * Delete the snapshot identified by snapshot_id (matched by id first,
 * then by name): drop the references it holds on data/L2 clusters,
 * refresh the COPIED flags of the active state (some clusters may now
 * be solely owned again), free the snapshot's L1 table clusters, and
 * rewrite the on-disk snapshot table without this entry.
 *
 * Returns 0 on success, -ENOENT if not found, or a negative error
 * from the refcount/table updates.
 */
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    int snapshot_index, ret;

    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
    if (snapshot_index < 0)
        return -ENOENT;
    sn = &s->snapshots[snapshot_index];

    /* drop the snapshot's references on its clusters */
    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
    if (ret < 0)
        return ret;
    /* must update the copied flag on the current cluster offsets */
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
    if (ret < 0)
        return ret;
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));

    /* remove the entry from the in-memory table and rewrite it */
    qemu_free(sn->id_str);
    qemu_free(sn->name);
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
    s->nb_snapshots--;
    ret = qcow_write_snapshots(bs);
    if (ret < 0) {
        /* XXX: restore snapshot if error ? */
        return ret;
    }
#ifdef DEBUG_ALLOC
    check_refcounts(bs);
#endif
    return 0;
}
2248

    
2249
static int qcow_snapshot_list(BlockDriverState *bs,
2250
                              QEMUSnapshotInfo **psn_tab)
2251
{
2252
    BDRVQcowState *s = bs->opaque;
2253
    QEMUSnapshotInfo *sn_tab, *sn_info;
2254
    QCowSnapshot *sn;
2255
    int i;
2256

    
2257
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2258
    for(i = 0; i < s->nb_snapshots; i++) {
2259
        sn_info = sn_tab + i;
2260
        sn = s->snapshots + i;
2261
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2262
                sn->id_str);
2263
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2264
                sn->name);
2265
        sn_info->vm_state_size = sn->vm_state_size;
2266
        sn_info->date_sec = sn->date_sec;
2267
        sn_info->date_nsec = sn->date_nsec;
2268
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2269
    }
2270
    *psn_tab = sn_tab;
2271
    return s->nb_snapshots;
2272
}
2273

    
2274
/*********************************************************/
2275
/* refcount handling */
2276

    
2277
/*
 * Allocate the one-cluster refcount block cache and load the refcount
 * table (stored big-endian) from the image file.
 *
 * Returns 0 on success.  On failure all allocations made here are
 * released and the pointers cleared, so a subsequent refcount_close()
 * is safe, and -EIO is returned (the original returned -ENOMEM for
 * what is actually a short read, and leaked both buffers).
 */
static int refcount_init(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int ret, refcount_table_size2, i;

    s->refcount_block_cache = qemu_malloc(s->cluster_size);
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = qemu_malloc(refcount_table_size2);
    if (s->refcount_table_size > 0) {
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret != refcount_table_size2)
            goto fail;
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
    }
    return 0;
 fail:
    /* free and clear so a later refcount_close() cannot double-free */
    qemu_free(s->refcount_block_cache);
    s->refcount_block_cache = NULL;
    qemu_free(s->refcount_table);
    s->refcount_table = NULL;
    return -EIO;
}
2297

    
2298
/* Release the refcount block cache and the in-memory refcount table. */
static void refcount_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    qemu_free(s->refcount_table);
    qemu_free(s->refcount_block_cache);
}
2304

    
2305

    
2306
/* Read the refcount block at refcount_block_offset into the single
   cached block buffer and remember which block is cached.
   Returns 0 on success, -EIO on a short read. */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset)
{
    BDRVQcowState *s = bs->opaque;

    if (bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
                   s->cluster_size) != s->cluster_size)
        return -EIO;
    s->refcount_block_cache_offset = refcount_block_offset;
    return 0;
}
2318

    
2319
/* Return the refcount of the given cluster.  A cluster beyond the
   refcount table, or one whose refcount block was never allocated,
   reports 0 (free).  On a block read error the cluster is
   conservatively reported as allocated (refcount 1). */
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *s = bs->opaque;
    int table_index, slot;
    int64_t block_offset;

    table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (table_index >= s->refcount_table_size)
        return 0;
    block_offset = s->refcount_table[table_index];
    if (!block_offset)
        return 0;
    if (block_offset != s->refcount_block_cache_offset &&
        load_refcount_block(bs, block_offset) < 0) {
        /* better than nothing: return allocated if read error */
        return 1;
    }
    slot = cluster_index & ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    return be16_to_cpu(s->refcount_block_cache[slot]);
}
2340

    
2341
/* return < 0 if error */
2342
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2343
{
2344
    BDRVQcowState *s = bs->opaque;
2345
    int i, nb_clusters;
2346

    
2347
    nb_clusters = size_to_clusters(s, size);
2348
retry:
2349
    for(i = 0; i < nb_clusters; i++) {
2350
        int64_t i = s->free_cluster_index++;
2351
        if (get_refcount(bs, i) != 0)
2352
            goto retry;
2353
    }
2354
#ifdef DEBUG_ALLOC2
2355
    printf("alloc_clusters: size=%lld -> %lld\n",
2356
            size,
2357
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2358
#endif
2359
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2360
}
2361

    
2362
/* Allocate enough contiguous clusters for 'size' bytes and take one
   reference on each.  Returns the file offset of the first cluster. */
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
{
    int64_t first_cluster_offset = alloc_clusters_noref(bs, size);

    update_refcount(bs, first_cluster_offset, size, 1);
    return first_cluster_offset;
}
2370

    
2371
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
/*
 * Sub-cluster byte allocator.  s->free_byte_offset tracks the first
 * unused byte of a partially filled cluster (0 means none).  Returns
 * the file offset of the allocated bytes; each cluster touched by an
 * allocation gains one reference per allocation that lands in it.
 */
static int64_t alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, cluster_offset;
    int free_in_cluster;

    assert(size > 0 && size <= s->cluster_size);
    if (s->free_byte_offset == 0) {
        /* no partially used cluster: start a fresh one (refcount 1) */
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
    }
 redo:
    /* bytes remaining before the end of the current cluster */
    free_in_cluster = s->cluster_size -
        (s->free_byte_offset & (s->cluster_size - 1));
    if (size <= free_in_cluster) {
        /* enough space in current cluster */
        offset = s->free_byte_offset;
        s->free_byte_offset += size;
        free_in_cluster -= size;
        if (free_in_cluster == 0)
            s->free_byte_offset = 0;
        /* an allocation starting mid-cluster adds a reference; one
           starting at offset 0 reuses the reference taken by
           alloc_clusters() above */
        if ((offset & (s->cluster_size - 1)) != 0)
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
    } else {
        /* does not fit: allocate a new cluster and hope it directly
           follows the current one so the data stays contiguous */
        offset = alloc_clusters(bs, s->cluster_size);
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
        if ((cluster_offset + s->cluster_size) == offset) {
            /* we are lucky: contiguous data */
            offset = s->free_byte_offset;
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
            s->free_byte_offset += size;
        } else {
            /* not adjacent: abandon the old cluster's tail and retry
               from the freshly allocated cluster */
            s->free_byte_offset = offset;
            goto redo;
        }
    }
    return offset;
}
2410

    
2411
/* Drop one reference from every cluster covered by the byte range
   [offset, offset + size); clusters whose refcount reaches zero
   become available for reallocation. */
static void free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size)
{
    update_refcount(bs, offset, size, -1);
}
2416

    
2417
/*
 * Grow the refcount table so it has at least min_size entries.  The
 * new size grows geometrically (x1.5) in whole clusters.  The enlarged
 * table is written big-endian to newly allocated clusters, the header
 * is updated to point at it, and only then are the old table's
 * clusters freed.
 *
 * alloc_clusters_noref() is used (not alloc_clusters()) because taking
 * the reference immediately could recurse back into this function; the
 * refcount of the new table is added at the end via update_refcount().
 *
 * Returns 0 on success (or if already large enough), -EIO on failure.
 *
 * NOTE(review): the fail path calls free_clusters() on clusters whose
 * refcount was never incremented (noref allocation), which decrements
 * them below zero and is silently rejected — verify against later
 * upstream fixes.
 */
static int grow_refcount_table(BlockDriverState *bs, int min_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
    uint64_t *new_table;
    int64_t table_offset;
    uint8_t data[12];
    int old_table_size;
    int64_t old_table_offset;

    if (min_size <= s->refcount_table_size)
        return 0;
    /* compute new table size */
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
    for(;;) {
        if (refcount_table_clusters == 0) {
            refcount_table_clusters = 1;
        } else {
            /* grow by roughly 1.5x each round */
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
        }
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
        if (min_size <= new_table_size)
            break;
    }
#ifdef DEBUG_ALLOC2
    printf("grow_refcount_table from %d to %d\n",
           s->refcount_table_size,
           new_table_size);
#endif
    new_table_size2 = new_table_size * sizeof(uint64_t);
    new_table = qemu_mallocz(new_table_size2);
    memcpy(new_table, s->refcount_table,
           s->refcount_table_size * sizeof(uint64_t));
    /* write the table big-endian, then convert back for in-memory use */
    for(i = 0; i < s->refcount_table_size; i++)
        cpu_to_be64s(&new_table[i]);
    /* Note: we cannot update the refcount now to avoid recursion */
    table_offset = alloc_clusters_noref(bs, new_table_size2);
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
    if (ret != new_table_size2)
        goto fail;
    for(i = 0; i < s->refcount_table_size; i++)
        be64_to_cpus(&new_table[i]);

    /* update the header: refcount_table_offset (8 bytes) immediately
       followed by refcount_table_clusters (4 bytes) */
    cpu_to_be64w((uint64_t*)data, table_offset);
    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
                    data, sizeof(data)) != sizeof(data))
        goto fail;
    qemu_free(s->refcount_table);
    old_table_offset = s->refcount_table_offset;
    old_table_size = s->refcount_table_size;
    s->refcount_table = new_table;
    s->refcount_table_size = new_table_size;
    s->refcount_table_offset = table_offset;

    /* now safe to account for the new table and release the old one */
    update_refcount(bs, table_offset, new_table_size2, 1);
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
    return 0;
 fail:
    free_clusters(bs, table_offset, new_table_size2);
    qemu_free(new_table);
    return -EIO;
}
2480

    
2481
/* addend must be 1 or -1 */
/* XXX: cache several refcount block clusters ? */
/*
 * Add 'addend' to the refcount of a single cluster, allocating a new
 * refcount block (and growing the refcount table) on demand when
 * incrementing.  The updated 2-byte refcount entry is written straight
 * through to disk.
 *
 * Returns the new refcount (>= 0) on success, or a negative errno:
 * -EINVAL when decrementing an untracked cluster, when the resulting
 * refcount would leave [0, 0xffff], or on allocation-path write
 * failures; -EIO on read/write errors of the refcount block.
 */
static int update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
                                   int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, refcount_block_offset;
    int ret, refcount_table_index, block_index, refcount;
    uint64_t data64;

    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (refcount_table_index >= s->refcount_table_size) {
        /* a cluster past the table cannot have a nonzero refcount, so
           decrementing it is an error */
        if (addend < 0)
            return -EINVAL;
        ret = grow_refcount_table(bs, refcount_table_index + 1);
        if (ret < 0)
            return ret;
    }
    refcount_block_offset = s->refcount_table[refcount_table_index];
    if (!refcount_block_offset) {
        if (addend < 0)
            return -EINVAL;
        /* create a new refcount block */
        /* Note: we cannot update the refcount now to avoid recursion */
        offset = alloc_clusters_noref(bs, s->cluster_size);
        memset(s->refcount_block_cache, 0, s->cluster_size);
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
        if (ret != s->cluster_size)
            return -EINVAL;
        /* publish the new block in the refcount table, in memory and
           on disk (big-endian) */
        s->refcount_table[refcount_table_index] = offset;
        data64 = cpu_to_be64(offset);
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
                          refcount_table_index * sizeof(uint64_t),
                          &data64, sizeof(data64));
        if (ret != sizeof(data64))
            return -EINVAL;

        refcount_block_offset = offset;
        s->refcount_block_cache_offset = offset;
        /* now that the block exists, account for its own cluster */
        update_refcount(bs, offset, s->cluster_size, 1);
    } else {
        if (refcount_block_offset != s->refcount_block_cache_offset) {
            if (load_refcount_block(bs, refcount_block_offset) < 0)
                return -EIO;
        }
    }
    /* we can update the count and save it */
    block_index = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
    refcount += addend;
    if (refcount < 0 || refcount > 0xffff)
        return -EINVAL;
    /* remember the lowest free cluster to speed up future scans */
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
    /* write through just the 2-byte entry that changed */
    if (bdrv_pwrite(s->hd,
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
                    &s->refcount_block_cache[block_index], 2) != 2)
        return -EIO;
    return refcount;
}
2545

    
2546
/*
 * Apply 'addend' to the refcount of every cluster touched by the byte
 * range [offset, offset + length).  A non-positive length is a no-op.
 * Errors from update_cluster_refcount() are ignored (historical
 * behaviour of this void helper).
 *
 * Fix vs. the original: the DEBUG_ALLOC2 printf now casts its int64_t
 * arguments to long long so they match %lld on all platforms
 * (passing int64_t directly is undefined where int64_t is long).
 */
static void update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t start, last, cluster_offset;

#ifdef DEBUG_ALLOC2
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
           (long long)offset, (long long)length, addend);
#endif
    if (length <= 0)
        return;
    /* round the range outwards to whole clusters */
    start = offset & ~(s->cluster_size - 1);
    last = (offset + length - 1) & ~(s->cluster_size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        update_cluster_refcount(bs, cluster_offset >> s->cluster_bits, addend);
    }
}
2566

    
2567
#ifdef DEBUG_ALLOC
2568
/*
 * Consistency-check helper (DEBUG_ALLOC only): increment the shadow
 * refcount of every cluster covered by [offset, offset + size) in the
 * caller-provided refcount_table, printing diagnostics for clusters
 * outside the table or whose shadow counter overflows 16 bits.
 *
 * Fix vs. the original: the printf calls now cast the int64_t
 * cluster_offset to unsigned long long so it matches %llx (passing
 * int64_t directly is undefined where int64_t is long).
 */
static void inc_refcounts(BlockDriverState *bs,
                          uint16_t *refcount_table,
                          int refcount_table_size,
                          int64_t offset, int64_t size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t start, last, cluster_offset;
    int k;

    if (size <= 0)
        return;

    /* round the range outwards to whole clusters */
    start = offset & ~(s->cluster_size - 1);
    last = (offset + size - 1) & ~(s->cluster_size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        if (k < 0 || k >= refcount_table_size) {
            printf("ERROR: invalid cluster offset=0x%llx\n",
                   (unsigned long long)cluster_offset);
        } else {
            if (++refcount_table[k] == 0) {
                printf("ERROR: overflow cluster offset=0x%llx\n",
                       (unsigned long long)cluster_offset);
            }
        }
    }
}
2594

    
2595
/* Debug helper (DEBUG_ALLOC builds only): walk one L1 table and account
 * every cluster it references — the L1 table itself, each L2 table, and
 * every data or compressed cluster — in the shadow refcount table.
 *
 * When check_copied is non-zero, additionally verify that the
 * QCOW_OFLAG_COPIED bit is set exactly on entries whose cluster
 * refcount is 1.
 *
 * Returns 0 on success, -EIO if reading the L1 table or an L2 table
 * fails.
 *
 * Fix: the diagnostic printfs passed uint64_t/int64_t values for
 * %llx/%lld; those specifiers require (unsigned) long long, and the
 * fixed-width types are plain 'long' on LP64 hosts, which is undefined
 * behaviour. Explicit casts added; logic is unchanged.
 */
static int check_refcounts_l1(BlockDriverState *bs,
                              uint16_t *refcount_table,
                              int refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int check_copied)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
    int l2_size, i, j, nb_csectors, refcount;

    l2_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* the clusters holding the L1 table itself are referenced too */
    inc_refcounts(bs, refcount_table, refcount_table_size,
                  l1_table_offset, l1_size2);

    l1_table = qemu_malloc(l1_size2);
    if (bdrv_pread(s->hd, l1_table_offset,
                   l1_table, l1_size2) != l1_size2)
        goto fail;
    for(i = 0;i < l1_size; i++)
        be64_to_cpus(&l1_table[i]);

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            if (check_copied) {
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
                    printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
                           (unsigned long long)l2_offset, refcount);
                }
            }
            l2_offset &= ~QCOW_OFLAG_COPIED;
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* the COPIED bit overlaps the compressed size
                         * field, so it must never appear here */
                        if (offset & QCOW_OFLAG_COPIED) {
                            printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
                                   (long long)(offset >> s->cluster_bits));
                            offset &= ~QCOW_OFLAG_COPIED;
                        }
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        offset &= s->cluster_offset_mask;
                        /* compressed data is accounted in 512-byte sectors */
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset & ~511, nb_csectors * 512);
                    } else {
                        if (check_copied) {
                            refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
                            if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
                                printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
                                       (unsigned long long)offset, refcount);
                            }
                        }
                        offset &= ~QCOW_OFLAG_COPIED;
                        inc_refcounts(bs, refcount_table,
                                      refcount_table_size,
                                      offset, s->cluster_size);
                    }
                }
            }
            /* the cluster holding the L2 table itself */
            inc_refcounts(bs, refcount_table,
                          refcount_table_size,
                          l2_offset,
                          s->cluster_size);
        }
    }
    qemu_free(l1_table);
    qemu_free(l2_table);
    return 0;
 fail:
    printf("ERROR: I/O error in check_refcounts_l1\n");
    qemu_free(l1_table);
    qemu_free(l2_table);
    return -EIO;
}
2678

    
2679
/* Debug helper (DEBUG_ALLOC builds only): rebuild the reference count of
 * every cluster from the image metadata (header, active L1 table,
 * snapshot L1 tables, snapshot table, refcount structures) and compare
 * the result against the on-disk refcounts, reporting mismatches on
 * stdout. */
static void check_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowSnapshot *sn;
    uint16_t *shadow_table;
    int64_t img_size;
    int nb_clusters, i, on_disk, computed;

    img_size = bdrv_getlength(s->hd);
    nb_clusters = size_to_clusters(s, img_size);
    shadow_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));

    /* header */
    inc_refcounts(bs, shadow_table, nb_clusters, 0, s->cluster_size);

    /* active L1 table, with COPIED-flag verification enabled */
    check_refcounts_l1(bs, shadow_table, nb_clusters,
                       s->l1_table_offset, s->l1_size, 1);

    /* snapshots */
    for (i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        check_refcounts_l1(bs, shadow_table, nb_clusters,
                           sn->l1_table_offset, sn->l1_size, 0);
    }
    inc_refcounts(bs, shadow_table, nb_clusters,
                  s->snapshots_offset, s->snapshots_size);

    /* refcount data */
    inc_refcounts(bs, shadow_table, nb_clusters,
                  s->refcount_table_offset,
                  s->refcount_table_size * sizeof(uint64_t));
    for (i = 0; i < s->refcount_table_size; i++) {
        int64_t block_offset = s->refcount_table[i];
        if (block_offset != 0) {
            inc_refcounts(bs, shadow_table, nb_clusters,
                          block_offset, s->cluster_size);
        }
    }

    /* compare ref counts */
    for (i = 0; i < nb_clusters; i++) {
        on_disk = get_refcount(bs, i);
        computed = shadow_table[i];
        if (on_disk != computed)
            printf("ERROR cluster %d refcount=%d reference=%d\n",
                   i, on_disk, computed);
    }

    qemu_free(shadow_table);
}
2731

    
2732
#if 0
2733
/* Disabled (#if 0) debug aid: print runs of consecutive clusters that
 * share the same reference count, one line per run.
 *
 * Fixes vs. the previous version:
 *  - print the *start* of the run (k1) rather than its end (k), so the
 *    output actually identifies which clusters the refcount applies to;
 *  - cast the int64_t arguments for %lld, which requires 'long long';
 *    int64_t is plain 'long' on LP64 hosts, so the uncast call was
 *    undefined behaviour.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(s->hd);
    nb_clusters = size_to_clusters(s, size);
    for(k = 0; k < nb_clusters;) {
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        /* extend the run while the refcount stays the same */
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        printf("%lld: refcount=%d nb=%lld\n",
               (long long)k1, refcount, (long long)(k - k1));
    }
}
2750
#endif
2751
#endif
2752

    
2753
/* VM-state save support: write 'size' bytes from 'buf' at byte offset
 * 'pos', temporarily marking the image growable so the state can be
 * appended past the nominal end of the image.
 *
 * Fix: the previous version discarded the bdrv_pwrite() result and
 * unconditionally returned 'size', silently masking write errors; the
 * sibling qcow_get_buffer() already propagates the I/O result, so do
 * the same here (bytes written on success, negative error otherwise).
 */
static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf,
                           int64_t pos, int size)
{
    int growable = bs->growable;
    int ret;

    bs->growable = 1;
    ret = bdrv_pwrite(bs, pos, buf, size);
    bs->growable = growable;

    return ret;
}
2764

    
2765
/* VM-state load support: read 'size' bytes into 'buf' from byte offset
 * 'pos', temporarily marking the image growable so state stored past
 * the nominal end of the image can be reached. Returns the
 * bdrv_pread() result. */
static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf,
                           int64_t pos, int size)
{
    int saved_growable = bs->growable;
    int nread;

    bs->growable = 1;
    nread = bdrv_pread(bs, pos, buf, size);
    bs->growable = saved_growable;

    return nread;
}
2777

    
2778
BlockDriver bdrv_qcow2 = {
2779
    .format_name        = "qcow2",
2780
    .instance_size        = sizeof(BDRVQcowState),
2781
    .bdrv_probe                = qcow_probe,
2782
    .bdrv_open                = qcow_open,
2783
    .bdrv_close                = qcow_close,
2784
    .bdrv_create        = qcow_create,
2785
    .bdrv_flush                = qcow_flush,
2786
    .bdrv_is_allocated        = qcow_is_allocated,
2787
    .bdrv_set_key        = qcow_set_key,
2788
    .bdrv_make_empty        = qcow_make_empty,
2789

    
2790
    .bdrv_aio_readv        = qcow_aio_readv,
2791
    .bdrv_aio_writev        = qcow_aio_writev,
2792
    .bdrv_aio_cancel        = qcow_aio_cancel,
2793
    .aiocb_size                = sizeof(QCowAIOCB),
2794
    .bdrv_write_compressed = qcow_write_compressed,
2795

    
2796
    .bdrv_snapshot_create = qcow_snapshot_create,
2797
    .bdrv_snapshot_goto        = qcow_snapshot_goto,
2798
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2799
    .bdrv_snapshot_list        = qcow_snapshot_list,
2800
    .bdrv_get_info        = qcow_get_info,
2801

    
2802
    .bdrv_put_buffer    = qcow_put_buffer,
2803
    .bdrv_get_buffer    = qcow_get_buffer,
2804

    
2805
    .bdrv_create2 = qcow_create2,
2806
};