Statistics
| Branch: | Revision:

root / block-qcow2.c @ 55616505

History | View | Annotate | Download (89.3 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block_int.h"
26
#include <zlib.h>
27
#include "aes.h"
28

    
29
/*
30
  Differences with QCOW:
31

32
  - Support for multiple incremental snapshots.
33
  - Memory management by reference counts.
34
  - Clusters which have a reference count of one have the bit
35
    QCOW_OFLAG_COPIED to optimize write performance.
36
  - Size of compressed clusters is stored in sectors to reduce bit usage
37
    in the cluster offsets.
38
  - Support for storing additional data (such as the VM state) in the
39
    snapshots.
40
  - If a backing store is used, the cluster size is not constrained
41
    (could be backported to QCOW).
42
  - L2 tables have always a size of one cluster.
43
*/
44

    
45
//#define DEBUG_ALLOC
46
//#define DEBUG_ALLOC2
47
//#define DEBUG_EXT
48

    
49
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
50
#define QCOW_VERSION 2
51

    
52
#define QCOW_CRYPT_NONE 0
53
#define QCOW_CRYPT_AES  1
54

    
55
#define QCOW_MAX_CRYPT_CLUSTERS 32
56

    
57
/* indicate that the refcount of the referenced cluster is exactly one. */
58
#define QCOW_OFLAG_COPIED     (1LL << 63)
59
/* indicate that the cluster is compressed (they never have the copied flag) */
60
#define QCOW_OFLAG_COMPRESSED (1LL << 62)
61

    
62
#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
63

    
64
typedef struct QCowHeader {
65
    uint32_t magic;
66
    uint32_t version;
67
    uint64_t backing_file_offset;
68
    uint32_t backing_file_size;
69
    uint32_t cluster_bits;
70
    uint64_t size; /* in bytes */
71
    uint32_t crypt_method;
72
    uint32_t l1_size; /* XXX: save number of clusters instead ? */
73
    uint64_t l1_table_offset;
74
    uint64_t refcount_table_offset;
75
    uint32_t refcount_table_clusters;
76
    uint32_t nb_snapshots;
77
    uint64_t snapshots_offset;
78
} QCowHeader;
79

    
80

    
81
typedef struct {
82
    uint32_t magic;
83
    uint32_t len;
84
} QCowExtension;
85
#define  QCOW_EXT_MAGIC_END 0
86
#define  QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
87

    
88

    
89
typedef struct __attribute__((packed)) QCowSnapshotHeader {
90
    /* header is 8 byte aligned */
91
    uint64_t l1_table_offset;
92

    
93
    uint32_t l1_size;
94
    uint16_t id_str_size;
95
    uint16_t name_size;
96

    
97
    uint32_t date_sec;
98
    uint32_t date_nsec;
99

    
100
    uint64_t vm_clock_nsec;
101

    
102
    uint32_t vm_state_size;
103
    uint32_t extra_data_size; /* for extension */
104
    /* extra data follows */
105
    /* id_str follows */
106
    /* name follows  */
107
} QCowSnapshotHeader;
108

    
109
#define L2_CACHE_SIZE 16
110

    
111
typedef struct QCowSnapshot {
112
    uint64_t l1_table_offset;
113
    uint32_t l1_size;
114
    char *id_str;
115
    char *name;
116
    uint32_t vm_state_size;
117
    uint32_t date_sec;
118
    uint32_t date_nsec;
119
    uint64_t vm_clock_nsec;
120
} QCowSnapshot;
121

    
122
typedef struct BDRVQcowState {
123
    BlockDriverState *hd;
124
    int cluster_bits;
125
    int cluster_size;
126
    int cluster_sectors;
127
    int l2_bits;
128
    int l2_size;
129
    int l1_size;
130
    int l1_vm_state_index;
131
    int csize_shift;
132
    int csize_mask;
133
    uint64_t cluster_offset_mask;
134
    uint64_t l1_table_offset;
135
    uint64_t *l1_table;
136
    uint64_t *l2_cache;
137
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
138
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
139
    uint8_t *cluster_cache;
140
    uint8_t *cluster_data;
141
    uint64_t cluster_cache_offset;
142

    
143
    uint64_t *refcount_table;
144
    uint64_t refcount_table_offset;
145
    uint32_t refcount_table_size;
146
    uint64_t refcount_block_cache_offset;
147
    uint16_t *refcount_block_cache;
148
    int64_t free_cluster_index;
149
    int64_t free_byte_offset;
150

    
151
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
152
    uint32_t crypt_method_header;
153
    AES_KEY aes_encrypt_key;
154
    AES_KEY aes_decrypt_key;
155
    uint64_t snapshots_offset;
156
    int snapshots_size;
157
    int nb_snapshots;
158
    QCowSnapshot *snapshots;
159
} BDRVQcowState;
160

    
161
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
162
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
163
                     uint8_t *buf, int nb_sectors);
164
static int qcow_read_snapshots(BlockDriverState *bs);
165
static void qcow_free_snapshots(BlockDriverState *bs);
166
static int refcount_init(BlockDriverState *bs);
167
static void refcount_close(BlockDriverState *bs);
168
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
169
static int update_cluster_refcount(BlockDriverState *bs,
170
                                   int64_t cluster_index,
171
                                   int addend);
172
static void update_refcount(BlockDriverState *bs,
173
                            int64_t offset, int64_t length,
174
                            int addend);
175
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
176
static int64_t alloc_bytes(BlockDriverState *bs, int size);
177
static void free_clusters(BlockDriverState *bs,
178
                          int64_t offset, int64_t size);
179
static int check_refcounts(BlockDriverState *bs);
180

    
181
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
182
{
183
    const QCowHeader *cow_header = (const void *)buf;
184

    
185
    if (buf_size >= sizeof(QCowHeader) &&
186
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
187
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
188
        return 100;
189
    else
190
        return 0;
191
}
192

    
193

    
194
/* 
195
 * read qcow2 extension and fill bs
196
 * start reading from start_offset
197
 * finish reading upon magic of value 0 or when end_offset reached
198
 * unknown magic is skipped (future extension this version knows nothing about)
199
 * return 0 upon success, non-0 otherwise
200
 */
201
static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
202
                                uint64_t end_offset)
203
{
204
    BDRVQcowState *s = bs->opaque;
205
    QCowExtension ext;
206
    uint64_t offset;
207

    
208
#ifdef DEBUG_EXT
209
    printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
210
#endif
211
    offset = start_offset;
212
    while (offset < end_offset) {
213

    
214
#ifdef DEBUG_EXT
215
        /* Sanity check */
216
        if (offset > s->cluster_size)
217
            printf("qcow_handle_extension: suspicious offset %lu\n", offset);
218

    
219
        printf("attemting to read extended header in offset %lu\n", offset);
220
#endif
221

    
222
        if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
223
            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
224
                    (unsigned long long)offset);
225
            return 1;
226
        }
227
        be32_to_cpus(&ext.magic);
228
        be32_to_cpus(&ext.len);
229
        offset += sizeof(ext);
230
#ifdef DEBUG_EXT
231
        printf("ext.magic = 0x%x\n", ext.magic);
232
#endif
233
        switch (ext.magic) {
234
        case QCOW_EXT_MAGIC_END:
235
            return 0;
236

    
237
        case QCOW_EXT_MAGIC_BACKING_FORMAT:
238
            if (ext.len >= sizeof(bs->backing_format)) {
239
                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
240
                        " (>=%zu)\n",
241
                        ext.len, sizeof(bs->backing_format));
242
                return 2;
243
            }
244
            if (bdrv_pread(s->hd, offset , bs->backing_format,
245
                           ext.len) != ext.len)
246
                return 3;
247
            bs->backing_format[ext.len] = '\0';
248
#ifdef DEBUG_EXT
249
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
250
#endif
251
            offset += ((ext.len + 7) & ~7);
252
            break;
253

    
254
        default:
255
            /* unknown magic -- just skip it */
256
            offset += ((ext.len + 7) & ~7);
257
            break;
258
        }
259
    }
260

    
261
    return 0;
262
}
263

    
264

    
265
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
266
{
267
    BDRVQcowState *s = bs->opaque;
268
    int len, i, shift, ret;
269
    QCowHeader header;
270
    uint64_t ext_end;
271

    
272
    /* Performance is terrible right now with cache=writethrough due mainly
273
     * to reference count updates.  If the user does not explicitly specify
274
     * a caching type, force to writeback caching.
275
     */
276
    if ((flags & BDRV_O_CACHE_DEF)) {
277
        flags |= BDRV_O_CACHE_WB;
278
        flags &= ~BDRV_O_CACHE_DEF;
279
    }
280
    ret = bdrv_file_open(&s->hd, filename, flags);
281
    if (ret < 0)
282
        return ret;
283
    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
284
        goto fail;
285
    be32_to_cpus(&header.magic);
286
    be32_to_cpus(&header.version);
287
    be64_to_cpus(&header.backing_file_offset);
288
    be32_to_cpus(&header.backing_file_size);
289
    be64_to_cpus(&header.size);
290
    be32_to_cpus(&header.cluster_bits);
291
    be32_to_cpus(&header.crypt_method);
292
    be64_to_cpus(&header.l1_table_offset);
293
    be32_to_cpus(&header.l1_size);
294
    be64_to_cpus(&header.refcount_table_offset);
295
    be32_to_cpus(&header.refcount_table_clusters);
296
    be64_to_cpus(&header.snapshots_offset);
297
    be32_to_cpus(&header.nb_snapshots);
298

    
299
    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
300
        goto fail;
301
    if (header.size <= 1 ||
302
        header.cluster_bits < 9 ||
303
        header.cluster_bits > 16)
304
        goto fail;
305
    if (header.crypt_method > QCOW_CRYPT_AES)
306
        goto fail;
307
    s->crypt_method_header = header.crypt_method;
308
    if (s->crypt_method_header)
309
        bs->encrypted = 1;
310
    s->cluster_bits = header.cluster_bits;
311
    s->cluster_size = 1 << s->cluster_bits;
312
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
313
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
314
    s->l2_size = 1 << s->l2_bits;
315
    bs->total_sectors = header.size / 512;
316
    s->csize_shift = (62 - (s->cluster_bits - 8));
317
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
318
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
319
    s->refcount_table_offset = header.refcount_table_offset;
320
    s->refcount_table_size =
321
        header.refcount_table_clusters << (s->cluster_bits - 3);
322

    
323
    s->snapshots_offset = header.snapshots_offset;
324
    s->nb_snapshots = header.nb_snapshots;
325

    
326
    /* read the level 1 table */
327
    s->l1_size = header.l1_size;
328
    shift = s->cluster_bits + s->l2_bits;
329
    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
330
    /* the L1 table must contain at least enough entries to put
331
       header.size bytes */
332
    if (s->l1_size < s->l1_vm_state_index)
333
        goto fail;
334
    s->l1_table_offset = header.l1_table_offset;
335
    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
336
    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
337
        s->l1_size * sizeof(uint64_t))
338
        goto fail;
339
    for(i = 0;i < s->l1_size; i++) {
340
        be64_to_cpus(&s->l1_table[i]);
341
    }
342
    /* alloc L2 cache */
343
    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
344
    s->cluster_cache = qemu_malloc(s->cluster_size);
345
    /* one more sector for decompressed data alignment */
346
    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
347
                                  + 512);
348
    s->cluster_cache_offset = -1;
349

    
350
    if (refcount_init(bs) < 0)
351
        goto fail;
352

    
353
    /* read qcow2 extensions */
354
    if (header.backing_file_offset)
355
        ext_end = header.backing_file_offset;
356
    else
357
        ext_end = s->cluster_size;
358
    if (qcow_read_extensions(bs, sizeof(header), ext_end))
359
        goto fail;
360

    
361
    /* read the backing file name */
362
    if (header.backing_file_offset != 0) {
363
        len = header.backing_file_size;
364
        if (len > 1023)
365
            len = 1023;
366
        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
367
            goto fail;
368
        bs->backing_file[len] = '\0';
369
    }
370
    if (qcow_read_snapshots(bs) < 0)
371
        goto fail;
372

    
373
#ifdef DEBUG_ALLOC
374
    check_refcounts(bs);
375
#endif
376
    return 0;
377

    
378
 fail:
379
    qcow_free_snapshots(bs);
380
    refcount_close(bs);
381
    qemu_free(s->l1_table);
382
    qemu_free(s->l2_cache);
383
    qemu_free(s->cluster_cache);
384
    qemu_free(s->cluster_data);
385
    bdrv_delete(s->hd);
386
    return -1;
387
}
388

    
389
static int qcow_set_key(BlockDriverState *bs, const char *key)
390
{
391
    BDRVQcowState *s = bs->opaque;
392
    uint8_t keybuf[16];
393
    int len, i;
394

    
395
    memset(keybuf, 0, 16);
396
    len = strlen(key);
397
    if (len > 16)
398
        len = 16;
399
    /* XXX: we could compress the chars to 7 bits to increase
400
       entropy */
401
    for(i = 0;i < len;i++) {
402
        keybuf[i] = key[i];
403
    }
404
    s->crypt_method = s->crypt_method_header;
405

    
406
    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
407
        return -1;
408
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
409
        return -1;
410
#if 0
411
    /* test */
412
    {
413
        uint8_t in[16];
414
        uint8_t out[16];
415
        uint8_t tmp[16];
416
        for(i=0;i<16;i++)
417
            in[i] = i;
418
        AES_encrypt(in, tmp, &s->aes_encrypt_key);
419
        AES_decrypt(tmp, out, &s->aes_decrypt_key);
420
        for(i = 0; i < 16; i++)
421
            printf(" %02x", tmp[i]);
422
        printf("\n");
423
        for(i = 0; i < 16; i++)
424
            printf(" %02x", out[i]);
425
        printf("\n");
426
    }
427
#endif
428
    return 0;
429
}
430

    
431
/* The crypt function is compatible with the linux cryptoloop
432
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
433
   supported */
434
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
435
                            uint8_t *out_buf, const uint8_t *in_buf,
436
                            int nb_sectors, int enc,
437
                            const AES_KEY *key)
438
{
439
    union {
440
        uint64_t ll[2];
441
        uint8_t b[16];
442
    } ivec;
443
    int i;
444

    
445
    for(i = 0; i < nb_sectors; i++) {
446
        ivec.ll[0] = cpu_to_le64(sector_num);
447
        ivec.ll[1] = 0;
448
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
449
                        ivec.b, enc);
450
        sector_num++;
451
        in_buf += 512;
452
        out_buf += 512;
453
    }
454
}
455

    
456
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
457
                        uint64_t cluster_offset, int n_start, int n_end)
458
{
459
    BDRVQcowState *s = bs->opaque;
460
    int n, ret;
461

    
462
    n = n_end - n_start;
463
    if (n <= 0)
464
        return 0;
465
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
466
    if (ret < 0)
467
        return ret;
468
    if (s->crypt_method) {
469
        encrypt_sectors(s, start_sect + n_start,
470
                        s->cluster_data,
471
                        s->cluster_data, n, 1,
472
                        &s->aes_encrypt_key);
473
    }
474
    ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
475
                     s->cluster_data, n);
476
    if (ret < 0)
477
        return ret;
478
    return 0;
479
}
480

    
481
static void l2_cache_reset(BlockDriverState *bs)
482
{
483
    BDRVQcowState *s = bs->opaque;
484

    
485
    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
486
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
487
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
488
}
489

    
490
static inline int l2_cache_new_entry(BlockDriverState *bs)
491
{
492
    BDRVQcowState *s = bs->opaque;
493
    uint32_t min_count;
494
    int min_index, i;
495

    
496
    /* find a new entry in the least used one */
497
    min_index = 0;
498
    min_count = 0xffffffff;
499
    for(i = 0; i < L2_CACHE_SIZE; i++) {
500
        if (s->l2_cache_counts[i] < min_count) {
501
            min_count = s->l2_cache_counts[i];
502
            min_index = i;
503
        }
504
    }
505
    return min_index;
506
}
507

    
508
static int64_t align_offset(int64_t offset, int n)
509
{
510
    offset = (offset + n - 1) & ~(n - 1);
511
    return offset;
512
}
513

    
514
static int grow_l1_table(BlockDriverState *bs, int min_size)
515
{
516
    BDRVQcowState *s = bs->opaque;
517
    int new_l1_size, new_l1_size2, ret, i;
518
    uint64_t *new_l1_table;
519
    uint64_t new_l1_table_offset;
520
    uint8_t data[12];
521

    
522
    new_l1_size = s->l1_size;
523
    if (min_size <= new_l1_size)
524
        return 0;
525
    while (min_size > new_l1_size) {
526
        new_l1_size = (new_l1_size * 3 + 1) / 2;
527
    }
528
#ifdef DEBUG_ALLOC2
529
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
530
#endif
531

    
532
    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
533
    new_l1_table = qemu_mallocz(new_l1_size2);
534
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
535

    
536
    /* write new table (align to cluster) */
537
    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
538

    
539
    for(i = 0; i < s->l1_size; i++)
540
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
541
    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
542
    if (ret != new_l1_size2)
543
        goto fail;
544
    for(i = 0; i < s->l1_size; i++)
545
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
546

    
547
    /* set new table */
548
    cpu_to_be32w((uint32_t*)data, new_l1_size);
549
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
550
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
551
                sizeof(data)) != sizeof(data))
552
        goto fail;
553
    qemu_free(s->l1_table);
554
    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
555
    s->l1_table_offset = new_l1_table_offset;
556
    s->l1_table = new_l1_table;
557
    s->l1_size = new_l1_size;
558
    return 0;
559
 fail:
560
    qemu_free(s->l1_table);
561
    return -EIO;
562
}
563

    
564
/*
565
 * seek_l2_table
566
 *
567
 * seek l2_offset in the l2_cache table
568
 * if not found, return NULL,
569
 * if found,
570
 *   increments the l2 cache hit count of the entry,
571
 *   if counter overflow, divide by two all counters
572
 *   return the pointer to the l2 cache entry
573
 *
574
 */
575

    
576
static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
577
{
578
    int i, j;
579

    
580
    for(i = 0; i < L2_CACHE_SIZE; i++) {
581
        if (l2_offset == s->l2_cache_offsets[i]) {
582
            /* increment the hit count */
583
            if (++s->l2_cache_counts[i] == 0xffffffff) {
584
                for(j = 0; j < L2_CACHE_SIZE; j++) {
585
                    s->l2_cache_counts[j] >>= 1;
586
                }
587
            }
588
            return s->l2_cache + (i << s->l2_bits);
589
        }
590
    }
591
    return NULL;
592
}
593

    
594
/*
595
 * l2_load
596
 *
597
 * Loads a L2 table into memory. If the table is in the cache, the cache
598
 * is used; otherwise the L2 table is loaded from the image file.
599
 *
600
 * Returns a pointer to the L2 table on success, or NULL if the read from
601
 * the image file failed.
602
 */
603

    
604
static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
605
{
606
    BDRVQcowState *s = bs->opaque;
607
    int min_index;
608
    uint64_t *l2_table;
609

    
610
    /* seek if the table for the given offset is in the cache */
611

    
612
    l2_table = seek_l2_table(s, l2_offset);
613
    if (l2_table != NULL)
614
        return l2_table;
615

    
616
    /* not found: load a new entry in the least used one */
617

    
618
    min_index = l2_cache_new_entry(bs);
619
    l2_table = s->l2_cache + (min_index << s->l2_bits);
620
    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
621
        s->l2_size * sizeof(uint64_t))
622
        return NULL;
623
    s->l2_cache_offsets[min_index] = l2_offset;
624
    s->l2_cache_counts[min_index] = 1;
625

    
626
    return l2_table;
627
}
628

    
629
/*
630
 * l2_allocate
631
 *
632
 * Allocate a new l2 entry in the file. If l1_index points to an already
633
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
634
 * table) copy the contents of the old L2 table into the newly allocated one.
635
 * Otherwise the new table is initialized with zeros.
636
 *
637
 */
638

    
639
static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
640
{
641
    BDRVQcowState *s = bs->opaque;
642
    int min_index;
643
    uint64_t old_l2_offset, tmp;
644
    uint64_t *l2_table, l2_offset;
645

    
646
    old_l2_offset = s->l1_table[l1_index];
647

    
648
    /* allocate a new l2 entry */
649

    
650
    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
651

    
652
    /* update the L1 entry */
653

    
654
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
655

    
656
    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
657
    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
658
                    &tmp, sizeof(tmp)) != sizeof(tmp))
659
        return NULL;
660

    
661
    /* allocate a new entry in the l2 cache */
662

    
663
    min_index = l2_cache_new_entry(bs);
664
    l2_table = s->l2_cache + (min_index << s->l2_bits);
665

    
666
    if (old_l2_offset == 0) {
667
        /* if there was no old l2 table, clear the new table */
668
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
669
    } else {
670
        /* if there was an old l2 table, read it from the disk */
671
        if (bdrv_pread(s->hd, old_l2_offset,
672
                       l2_table, s->l2_size * sizeof(uint64_t)) !=
673
            s->l2_size * sizeof(uint64_t))
674
            return NULL;
675
    }
676
    /* write the l2 table to the file */
677
    if (bdrv_pwrite(s->hd, l2_offset,
678
                    l2_table, s->l2_size * sizeof(uint64_t)) !=
679
        s->l2_size * sizeof(uint64_t))
680
        return NULL;
681

    
682
    /* update the l2 cache entry */
683

    
684
    s->l2_cache_offsets[min_index] = l2_offset;
685
    s->l2_cache_counts[min_index] = 1;
686

    
687
    return l2_table;
688
}
689

    
690
static int size_to_clusters(BDRVQcowState *s, int64_t size)
691
{
692
    return (size + (s->cluster_size - 1)) >> s->cluster_bits;
693
}
694

    
695
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
696
        uint64_t *l2_table, uint64_t start, uint64_t mask)
697
{
698
    int i;
699
    uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;
700

    
701
    if (!offset)
702
        return 0;
703

    
704
    for (i = start; i < start + nb_clusters; i++)
705
        if (offset + i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask))
706
            break;
707

    
708
        return (i - start);
709
}
710

    
711
static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
712
{
713
    int i = 0;
714

    
715
    while(nb_clusters-- && l2_table[i] == 0)
716
        i++;
717

    
718
    return i;
719
}
720

    
721
/*
722
 * get_cluster_offset
723
 *
724
 * For a given offset of the disk image, return cluster offset in
725
 * qcow2 file.
726
 *
727
 * on entry, *num is the number of contiguous clusters we'd like to
728
 * access following offset.
729
 *
730
 * on exit, *num is the number of contiguous clusters we can read.
731
 *
732
 * Return 1, if the offset is found
733
 * Return 0, otherwise.
734
 *
735
 */
736

    
737
static uint64_t get_cluster_offset(BlockDriverState *bs,
738
                                   uint64_t offset, int *num)
739
{
740
    BDRVQcowState *s = bs->opaque;
741
    int l1_index, l2_index;
742
    uint64_t l2_offset, *l2_table, cluster_offset;
743
    int l1_bits, c;
744
    int index_in_cluster, nb_available, nb_needed, nb_clusters;
745

    
746
    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
747
    nb_needed = *num + index_in_cluster;
748

    
749
    l1_bits = s->l2_bits + s->cluster_bits;
750

    
751
    /* compute how many bytes there are between the offset and
752
     * the end of the l1 entry
753
     */
754

    
755
    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
756

    
757
    /* compute the number of available sectors */
758

    
759
    nb_available = (nb_available >> 9) + index_in_cluster;
760

    
761
    if (nb_needed > nb_available) {
762
        nb_needed = nb_available;
763
    }
764

    
765
    cluster_offset = 0;
766

    
767
    /* seek the the l2 offset in the l1 table */
768

    
769
    l1_index = offset >> l1_bits;
770
    if (l1_index >= s->l1_size)
771
        goto out;
772

    
773
    l2_offset = s->l1_table[l1_index];
774

    
775
    /* seek the l2 table of the given l2 offset */
776

    
777
    if (!l2_offset)
778
        goto out;
779

    
780
    /* load the l2 table in memory */
781

    
782
    l2_offset &= ~QCOW_OFLAG_COPIED;
783
    l2_table = l2_load(bs, l2_offset);
784
    if (l2_table == NULL)
785
        return 0;
786

    
787
    /* find the cluster offset for the given disk offset */
788

    
789
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
790
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
791
    nb_clusters = size_to_clusters(s, nb_needed << 9);
792

    
793
    if (!cluster_offset) {
794
        /* how many empty clusters ? */
795
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
796
    } else {
797
        /* how many allocated clusters ? */
798
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
799
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
800
    }
801

    
802
   nb_available = (c * s->cluster_sectors);
803
out:
804
    if (nb_available > nb_needed)
805
        nb_available = nb_needed;
806

    
807
    *num = nb_available - index_in_cluster;
808

    
809
    return cluster_offset & ~QCOW_OFLAG_COPIED;
810
}
811

    
812
/*
813
 * free_any_clusters
814
 *
815
 * free clusters according to its type: compressed or not
816
 *
817
 */
818

    
819
static void free_any_clusters(BlockDriverState *bs,
820
                              uint64_t cluster_offset, int nb_clusters)
821
{
822
    BDRVQcowState *s = bs->opaque;
823

    
824
    /* free the cluster */
825

    
826
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
827
        int nb_csectors;
828
        nb_csectors = ((cluster_offset >> s->csize_shift) &
829
                       s->csize_mask) + 1;
830
        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
831
                      nb_csectors * 512);
832
        return;
833
    }
834

    
835
    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
836

    
837
    return;
838
}
839

    
840
/*
841
 * get_cluster_table
842
 *
843
 * for a given disk offset, load (and allocate if needed)
844
 * the l2 table.
845
 *
846
 * the l2 table offset in the qcow2 file and the cluster index
847
 * in the l2 table are given to the caller.
848
 *
849
 */
850

    
851
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
852
                             uint64_t **new_l2_table,
853
                             uint64_t *new_l2_offset,
854
                             int *new_l2_index)
855
{
856
    BDRVQcowState *s = bs->opaque;
857
    int l1_index, l2_index, ret;
858
    uint64_t l2_offset, *l2_table;
859

    
860
    /* seek the the l2 offset in the l1 table */
861

    
862
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
863
    if (l1_index >= s->l1_size) {
864
        ret = grow_l1_table(bs, l1_index + 1);
865
        if (ret < 0)
866
            return 0;
867
    }
868
    l2_offset = s->l1_table[l1_index];
869

    
870
    /* seek the l2 table of the given l2 offset */
871

    
872
    if (l2_offset & QCOW_OFLAG_COPIED) {
873
        /* load the l2 table in memory */
874
        l2_offset &= ~QCOW_OFLAG_COPIED;
875
        l2_table = l2_load(bs, l2_offset);
876
        if (l2_table == NULL)
877
            return 0;
878
    } else {
879
        if (l2_offset)
880
            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
881
        l2_table = l2_allocate(bs, l1_index);
882
        if (l2_table == NULL)
883
            return 0;
884
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
885
    }
886

    
887
    /* find the cluster offset for the given disk offset */
888

    
889
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
890

    
891
    *new_l2_table = l2_table;
892
    *new_l2_offset = l2_offset;
893
    *new_l2_index = l2_index;
894

    
895
    return 1;
896
}
897

    
898
/*
899
 * alloc_compressed_cluster_offset
900
 *
901
 * For a given offset of the disk image, return cluster offset in
902
 * qcow2 file.
903
 *
904
 * If the offset is not found, allocate a new compressed cluster.
905
 *
906
 * Return the cluster offset if successful,
907
 * Return 0, otherwise.
908
 *
909
 */
910

    
911
static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
912
                                                uint64_t offset,
913
                                                int compressed_size)
914
{
915
    BDRVQcowState *s = bs->opaque;
916
    int l2_index, ret;
917
    uint64_t l2_offset, *l2_table, cluster_offset;
918
    int nb_csectors;
919

    
920
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
921
    if (ret == 0)
922
        return 0;
923

    
924
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
925
    if (cluster_offset & QCOW_OFLAG_COPIED)
926
        return cluster_offset & ~QCOW_OFLAG_COPIED;
927

    
928
    if (cluster_offset)
929
        free_any_clusters(bs, cluster_offset, 1);
930

    
931
    cluster_offset = alloc_bytes(bs, compressed_size);
932
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
933
                  (cluster_offset >> 9);
934

    
935
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
936
                      ((uint64_t)nb_csectors << s->csize_shift);
937

    
938
    /* update L2 table */
939

    
940
    /* compressed clusters never have the copied flag */
941

    
942
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
943
    if (bdrv_pwrite(s->hd,
944
                    l2_offset + l2_index * sizeof(uint64_t),
945
                    l2_table + l2_index,
946
                    sizeof(uint64_t)) != sizeof(uint64_t))
947
        return 0;
948

    
949
    return cluster_offset;
950
}
951

    
952
typedef struct QCowL2Meta
953
{
954
    uint64_t offset;
955
    int n_start;
956
    int nb_available;
957
    int nb_clusters;
958
} QCowL2Meta;
959

    
960
static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
961
        QCowL2Meta *m)
962
{
963
    BDRVQcowState *s = bs->opaque;
964
    int i, j = 0, l2_index, ret;
965
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
966

    
967
    if (m->nb_clusters == 0)
968
        return 0;
969

    
970
    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
971

    
972
    /* copy content of unmodified sectors */
973
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
974
    if (m->n_start) {
975
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
976
        if (ret < 0)
977
            goto err;
978
    }
979

    
980
    if (m->nb_available & (s->cluster_sectors - 1)) {
981
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
982
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
983
                m->nb_available - end, s->cluster_sectors);
984
        if (ret < 0)
985
            goto err;
986
    }
987

    
988
    ret = -EIO;
989
    /* update L2 table */
990
    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
991
        goto err;
992

    
993
    for (i = 0; i < m->nb_clusters; i++) {
994
        /* if two concurrent writes happen to the same unallocated cluster
995
         * each write allocates separate cluster and writes data concurrently.
996
         * The first one to complete updates l2 table with pointer to its
997
         * cluster the second one has to do RMW (which is done above by
998
         * copy_sectors()), update l2 table with its cluster pointer and free
999
         * old cluster. This is what this loop does */
1000
        if(l2_table[l2_index + i] != 0)
1001
            old_cluster[j++] = l2_table[l2_index + i];
1002

    
1003
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
1004
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
1005
     }
1006

    
1007
    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
1008
                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
1009
            m->nb_clusters * sizeof(uint64_t))
1010
        goto err;
1011

    
1012
    for (i = 0; i < j; i++)
1013
        free_any_clusters(bs, be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED,
1014
                          1);
1015

    
1016
    ret = 0;
1017
err:
1018
    qemu_free(old_cluster);
1019
    return ret;
1020
 }
1021

    
1022
/*
1023
 * alloc_cluster_offset
1024
 *
1025
 * For a given offset of the disk image, return cluster offset in
1026
 * qcow2 file.
1027
 *
1028
 * If the offset is not found, allocate a new cluster.
1029
 *
1030
 * Return the cluster offset if successful,
1031
 * Return 0, otherwise.
1032
 *
1033
 */
1034

    
1035
static uint64_t alloc_cluster_offset(BlockDriverState *bs,
1036
                                     uint64_t offset,
1037
                                     int n_start, int n_end,
1038
                                     int *num, QCowL2Meta *m)
1039
{
1040
    BDRVQcowState *s = bs->opaque;
1041
    int l2_index, ret;
1042
    uint64_t l2_offset, *l2_table, cluster_offset;
1043
    int nb_clusters, i = 0;
1044

    
1045
    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
1046
    if (ret == 0)
1047
        return 0;
1048

    
1049
    nb_clusters = size_to_clusters(s, n_end << 9);
1050

    
1051
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1052

    
1053
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
1054

    
1055
    /* We keep all QCOW_OFLAG_COPIED clusters */
1056

    
1057
    if (cluster_offset & QCOW_OFLAG_COPIED) {
1058
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
1059
                &l2_table[l2_index], 0, 0);
1060

    
1061
        cluster_offset &= ~QCOW_OFLAG_COPIED;
1062
        m->nb_clusters = 0;
1063

    
1064
        goto out;
1065
    }
1066

    
1067
    /* for the moment, multiple compressed clusters are not managed */
1068

    
1069
    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
1070
        nb_clusters = 1;
1071

    
1072
    /* how many available clusters ? */
1073

    
1074
    while (i < nb_clusters) {
1075
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
1076
                &l2_table[l2_index], i, 0);
1077

    
1078
        if(be64_to_cpu(l2_table[l2_index + i]))
1079
            break;
1080

    
1081
        i += count_contiguous_free_clusters(nb_clusters - i,
1082
                &l2_table[l2_index + i]);
1083

    
1084
        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
1085

    
1086
        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
1087
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
1088
            break;
1089
    }
1090
    nb_clusters = i;
1091

    
1092
    /* allocate a new cluster */
1093

    
1094
    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
1095

    
1096
    /* save info needed for meta data update */
1097
    m->offset = offset;
1098
    m->n_start = n_start;
1099
    m->nb_clusters = nb_clusters;
1100

    
1101
out:
1102
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
1103

    
1104
    *num = m->nb_available - n_start;
1105

    
1106
    return cluster_offset;
1107
}
1108

    
1109
static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
1110
                             int nb_sectors, int *pnum)
1111
{
1112
    uint64_t cluster_offset;
1113

    
1114
    *pnum = nb_sectors;
1115
    cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum);
1116

    
1117
    return (cluster_offset != 0);
1118
}
1119

    
1120
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1121
                             const uint8_t *buf, int buf_size)
1122
{
1123
    z_stream strm1, *strm = &strm1;
1124
    int ret, out_len;
1125

    
1126
    memset(strm, 0, sizeof(*strm));
1127

    
1128
    strm->next_in = (uint8_t *)buf;
1129
    strm->avail_in = buf_size;
1130
    strm->next_out = out_buf;
1131
    strm->avail_out = out_buf_size;
1132

    
1133
    ret = inflateInit2(strm, -12);
1134
    if (ret != Z_OK)
1135
        return -1;
1136
    ret = inflate(strm, Z_FINISH);
1137
    out_len = strm->next_out - out_buf;
1138
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1139
        out_len != out_buf_size) {
1140
        inflateEnd(strm);
1141
        return -1;
1142
    }
1143
    inflateEnd(strm);
1144
    return 0;
1145
}
1146

    
1147
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
1148
{
1149
    int ret, csize, nb_csectors, sector_offset;
1150
    uint64_t coffset;
1151

    
1152
    coffset = cluster_offset & s->cluster_offset_mask;
1153
    if (s->cluster_cache_offset != coffset) {
1154
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1155
        sector_offset = coffset & 511;
1156
        csize = nb_csectors * 512 - sector_offset;
1157
        ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
1158
        if (ret < 0) {
1159
            return -1;
1160
        }
1161
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
1162
                              s->cluster_data + sector_offset, csize) < 0) {
1163
            return -1;
1164
        }
1165
        s->cluster_cache_offset = coffset;
1166
    }
1167
    return 0;
1168
}
1169

    
1170
/* handle reading after the end of the backing file */
1171
static int backing_read1(BlockDriverState *bs,
1172
                         int64_t sector_num, uint8_t *buf, int nb_sectors)
1173
{
1174
    int n1;
1175
    if ((sector_num + nb_sectors) <= bs->total_sectors)
1176
        return nb_sectors;
1177
    if (sector_num >= bs->total_sectors)
1178
        n1 = 0;
1179
    else
1180
        n1 = bs->total_sectors - sector_num;
1181
    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
1182
    return n1;
1183
}
1184

    
1185
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
1186
                     uint8_t *buf, int nb_sectors)
1187
{
1188
    BDRVQcowState *s = bs->opaque;
1189
    int ret, index_in_cluster, n, n1;
1190
    uint64_t cluster_offset;
1191

    
1192
    while (nb_sectors > 0) {
1193
        n = nb_sectors;
1194
        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
1195
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1196
        if (!cluster_offset) {
1197
            if (bs->backing_hd) {
1198
                /* read from the base image */
1199
                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
1200
                if (n1 > 0) {
1201
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
1202
                    if (ret < 0)
1203
                        return -1;
1204
                }
1205
            } else {
1206
                memset(buf, 0, 512 * n);
1207
            }
1208
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
1209
            if (decompress_cluster(s, cluster_offset) < 0)
1210
                return -1;
1211
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
1212
        } else {
1213
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1214
            if (ret != n * 512)
1215
                return -1;
1216
            if (s->crypt_method) {
1217
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
1218
                                &s->aes_decrypt_key);
1219
            }
1220
        }
1221
        nb_sectors -= n;
1222
        sector_num += n;
1223
        buf += n * 512;
1224
    }
1225
    return 0;
1226
}
1227

    
1228
static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1229
                     const uint8_t *buf, int nb_sectors)
1230
{
1231
    BDRVQcowState *s = bs->opaque;
1232
    int ret, index_in_cluster, n;
1233
    uint64_t cluster_offset;
1234
    int n_end;
1235
    QCowL2Meta l2meta;
1236

    
1237
    while (nb_sectors > 0) {
1238
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1239
        n_end = index_in_cluster + nb_sectors;
1240
        if (s->crypt_method &&
1241
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1242
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1243
        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
1244
                                              index_in_cluster,
1245
                                              n_end, &n, &l2meta);
1246
        if (!cluster_offset)
1247
            return -1;
1248
        if (s->crypt_method) {
1249
            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
1250
                            &s->aes_encrypt_key);
1251
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
1252
                              s->cluster_data, n * 512);
1253
        } else {
1254
            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1255
        }
1256
        if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
1257
            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
1258
            return -1;
1259
        }
1260
        nb_sectors -= n;
1261
        sector_num += n;
1262
        buf += n * 512;
1263
    }
1264
    s->cluster_cache_offset = -1; /* disable compressed cache */
1265
    return 0;
1266
}
1267

    
1268
typedef struct QCowAIOCB {
1269
    BlockDriverAIOCB common;
1270
    int64_t sector_num;
1271
    QEMUIOVector *qiov;
1272
    uint8_t *buf;
1273
    void *orig_buf;
1274
    int nb_sectors;
1275
    int n;
1276
    uint64_t cluster_offset;
1277
    uint8_t *cluster_data;
1278
    BlockDriverAIOCB *hd_aiocb;
1279
    struct iovec hd_iov;
1280
    QEMUIOVector hd_qiov;
1281
    QEMUBH *bh;
1282
    QCowL2Meta l2meta;
1283
} QCowAIOCB;
1284

    
1285
static void qcow_aio_read_cb(void *opaque, int ret);
1286
static void qcow_aio_read_bh(void *opaque)
1287
{
1288
    QCowAIOCB *acb = opaque;
1289
    qemu_bh_delete(acb->bh);
1290
    acb->bh = NULL;
1291
    qcow_aio_read_cb(opaque, 0);
1292
}
1293

    
1294
static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
1295
{
1296
    if (acb->bh)
1297
        return -EIO;
1298

    
1299
    acb->bh = qemu_bh_new(cb, acb);
1300
    if (!acb->bh)
1301
        return -EIO;
1302

    
1303
    qemu_bh_schedule(acb->bh);
1304

    
1305
    return 0;
1306
}
1307

    
1308
static void qcow_aio_read_cb(void *opaque, int ret)
1309
{
1310
    QCowAIOCB *acb = opaque;
1311
    BlockDriverState *bs = acb->common.bs;
1312
    BDRVQcowState *s = bs->opaque;
1313
    int index_in_cluster, n1;
1314

    
1315
    acb->hd_aiocb = NULL;
1316
    if (ret < 0)
1317
        goto done;
1318

    
1319
    /* post process the read buffer */
1320
    if (!acb->cluster_offset) {
1321
        /* nothing to do */
1322
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1323
        /* nothing to do */
1324
    } else {
1325
        if (s->crypt_method) {
1326
            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
1327
                            acb->n, 0,
1328
                            &s->aes_decrypt_key);
1329
        }
1330
    }
1331

    
1332
    acb->nb_sectors -= acb->n;
1333
    acb->sector_num += acb->n;
1334
    acb->buf += acb->n * 512;
1335

    
1336
    if (acb->nb_sectors == 0) {
1337
        /* request completed */
1338
        ret = 0;
1339
        goto done;
1340
    }
1341

    
1342
    /* prepare next AIO request */
1343
    acb->n = acb->nb_sectors;
1344
    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
1345
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1346

    
1347
    if (!acb->cluster_offset) {
1348
        if (bs->backing_hd) {
1349
            /* read from the base image */
1350
            n1 = backing_read1(bs->backing_hd, acb->sector_num,
1351
                               acb->buf, acb->n);
1352
            if (n1 > 0) {
1353
                acb->hd_iov.iov_base = (void *)acb->buf;
1354
                acb->hd_iov.iov_len = acb->n * 512;
1355
                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
1356
                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
1357
                                    &acb->hd_qiov, acb->n,
1358
                                    qcow_aio_read_cb, acb);
1359
                if (acb->hd_aiocb == NULL)
1360
                    goto done;
1361
            } else {
1362
                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1363
                if (ret < 0)
1364
                    goto done;
1365
            }
1366
        } else {
1367
            /* Note: in this case, no need to wait */
1368
            memset(acb->buf, 0, 512 * acb->n);
1369
            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1370
            if (ret < 0)
1371
                goto done;
1372
        }
1373
    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
1374
        /* add AIO support for compressed blocks ? */
1375
        if (decompress_cluster(s, acb->cluster_offset) < 0)
1376
            goto done;
1377
        memcpy(acb->buf,
1378
               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
1379
        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
1380
        if (ret < 0)
1381
            goto done;
1382
    } else {
1383
        if ((acb->cluster_offset & 511) != 0) {
1384
            ret = -EIO;
1385
            goto done;
1386
        }
1387

    
1388
        acb->hd_iov.iov_base = (void *)acb->buf;
1389
        acb->hd_iov.iov_len = acb->n * 512;
1390
        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
1391
        acb->hd_aiocb = bdrv_aio_readv(s->hd,
1392
                            (acb->cluster_offset >> 9) + index_in_cluster,
1393
                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
1394
        if (acb->hd_aiocb == NULL)
1395
            goto done;
1396
    }
1397

    
1398
    return;
1399
done:
1400
    if (acb->qiov->niov > 1) {
1401
        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
1402
        qemu_vfree(acb->orig_buf);
1403
    }
1404
    acb->common.cb(acb->common.opaque, ret);
1405
    qemu_aio_release(acb);
1406
}
1407

    
1408
static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1409
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1410
        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
1411
{
1412
    QCowAIOCB *acb;
1413

    
1414
    acb = qemu_aio_get(bs, cb, opaque);
1415
    if (!acb)
1416
        return NULL;
1417
    acb->hd_aiocb = NULL;
1418
    acb->sector_num = sector_num;
1419
    acb->qiov = qiov;
1420
    if (qiov->niov > 1) {
1421
        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
1422
        if (is_write)
1423
            qemu_iovec_to_buffer(qiov, acb->buf);
1424
    } else {
1425
        acb->buf = (uint8_t *)qiov->iov->iov_base;
1426
    }
1427
    acb->nb_sectors = nb_sectors;
1428
    acb->n = 0;
1429
    acb->cluster_offset = 0;
1430
    acb->l2meta.nb_clusters = 0;
1431
    return acb;
1432
}
1433

    
1434
static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
1435
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1436
        BlockDriverCompletionFunc *cb, void *opaque)
1437
{
1438
    QCowAIOCB *acb;
1439

    
1440
    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
1441
    if (!acb)
1442
        return NULL;
1443

    
1444
    qcow_aio_read_cb(acb, 0);
1445
    return &acb->common;
1446
}
1447

    
1448
static void qcow_aio_write_cb(void *opaque, int ret)
1449
{
1450
    QCowAIOCB *acb = opaque;
1451
    BlockDriverState *bs = acb->common.bs;
1452
    BDRVQcowState *s = bs->opaque;
1453
    int index_in_cluster;
1454
    const uint8_t *src_buf;
1455
    int n_end;
1456

    
1457
    acb->hd_aiocb = NULL;
1458

    
1459
    if (ret < 0)
1460
        goto done;
1461

    
1462
    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
1463
        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
1464
        goto done;
1465
    }
1466

    
1467
    acb->nb_sectors -= acb->n;
1468
    acb->sector_num += acb->n;
1469
    acb->buf += acb->n * 512;
1470

    
1471
    if (acb->nb_sectors == 0) {
1472
        /* request completed */
1473
        ret = 0;
1474
        goto done;
1475
    }
1476

    
1477
    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
1478
    n_end = index_in_cluster + acb->nb_sectors;
1479
    if (s->crypt_method &&
1480
        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1481
        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1482

    
1483
    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1484
                                          index_in_cluster,
1485
                                          n_end, &acb->n, &acb->l2meta);
1486
    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
1487
        ret = -EIO;
1488
        goto done;
1489
    }
1490
    if (s->crypt_method) {
1491
        if (!acb->cluster_data) {
1492
            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
1493
                                             s->cluster_size);
1494
        }
1495
        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
1496
                        acb->n, 1, &s->aes_encrypt_key);
1497
        src_buf = acb->cluster_data;
1498
    } else {
1499
        src_buf = acb->buf;
1500
    }
1501
    acb->hd_iov.iov_base = (void *)src_buf;
1502
    acb->hd_iov.iov_len = acb->n * 512;
1503
    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
1504
    acb->hd_aiocb = bdrv_aio_writev(s->hd,
1505
                                    (acb->cluster_offset >> 9) + index_in_cluster,
1506
                                    &acb->hd_qiov, acb->n,
1507
                                    qcow_aio_write_cb, acb);
1508
    if (acb->hd_aiocb == NULL)
1509
        goto done;
1510

    
1511
    return;
1512

    
1513
done:
1514
    if (acb->qiov->niov > 1)
1515
        qemu_vfree(acb->orig_buf);
1516
    acb->common.cb(acb->common.opaque, ret);
1517
    qemu_aio_release(acb);
1518
}
1519

    
1520
static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
1521
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1522
        BlockDriverCompletionFunc *cb, void *opaque)
1523
{
1524
    BDRVQcowState *s = bs->opaque;
1525
    QCowAIOCB *acb;
1526

    
1527
    s->cluster_cache_offset = -1; /* disable compressed cache */
1528

    
1529
    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
1530
    if (!acb)
1531
        return NULL;
1532

    
1533
    qcow_aio_write_cb(acb, 0);
1534
    return &acb->common;
1535
}
1536

    
1537
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
1538
{
1539
    QCowAIOCB *acb = (QCowAIOCB *)blockacb;
1540
    if (acb->hd_aiocb)
1541
        bdrv_aio_cancel(acb->hd_aiocb);
1542
    qemu_aio_release(acb);
1543
}
1544

    
1545
static void qcow_close(BlockDriverState *bs)
1546
{
1547
    BDRVQcowState *s = bs->opaque;
1548
    qemu_free(s->l1_table);
1549
    qemu_free(s->l2_cache);
1550
    qemu_free(s->cluster_cache);
1551
    qemu_free(s->cluster_data);
1552
    refcount_close(bs);
1553
    bdrv_delete(s->hd);
1554
}
1555

    
1556
/* XXX: use std qcow open function ? */
1557
typedef struct QCowCreateState {
1558
    int cluster_size;
1559
    int cluster_bits;
1560
    uint16_t *refcount_block;
1561
    uint64_t *refcount_table;
1562
    int64_t l1_table_offset;
1563
    int64_t refcount_table_offset;
1564
    int64_t refcount_block_offset;
1565
} QCowCreateState;
1566

    
1567
static void create_refcount_update(QCowCreateState *s,
1568
                                   int64_t offset, int64_t size)
1569
{
1570
    int refcount;
1571
    int64_t start, last, cluster_offset;
1572
    uint16_t *p;
1573

    
1574
    start = offset & ~(s->cluster_size - 1);
1575
    last = (offset + size - 1)  & ~(s->cluster_size - 1);
1576
    for(cluster_offset = start; cluster_offset <= last;
1577
        cluster_offset += s->cluster_size) {
1578
        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
1579
        refcount = be16_to_cpu(*p);
1580
        refcount++;
1581
        *p = cpu_to_be16(refcount);
1582
    }
1583
}
1584

    
1585
static int qcow_create2(const char *filename, int64_t total_size,
1586
                        const char *backing_file, const char *backing_format,
1587
                        int flags)
1588
{
1589

    
1590
    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
1591
    int ref_clusters, backing_format_len = 0;
1592
    QCowHeader header;
1593
    uint64_t tmp, offset;
1594
    QCowCreateState s1, *s = &s1;
1595
    QCowExtension ext_bf = {0, 0};
1596

    
1597

    
1598
    memset(s, 0, sizeof(*s));
1599

    
1600
    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
1601
    if (fd < 0)
1602
        return -1;
1603
    memset(&header, 0, sizeof(header));
1604
    header.magic = cpu_to_be32(QCOW_MAGIC);
1605
    header.version = cpu_to_be32(QCOW_VERSION);
1606
    header.size = cpu_to_be64(total_size * 512);
1607
    header_size = sizeof(header);
1608
    backing_filename_len = 0;
1609
    if (backing_file) {
1610
        if (backing_format) {
1611
            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
1612
            backing_format_len = strlen(backing_format);
1613
            ext_bf.len = (backing_format_len + 7) & ~7;
1614
            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
1615
        }
1616
        header.backing_file_offset = cpu_to_be64(header_size);
1617
        backing_filename_len = strlen(backing_file);
1618
        header.backing_file_size = cpu_to_be32(backing_filename_len);
1619
        header_size += backing_filename_len;
1620
    }
1621
    s->cluster_bits = 12;  /* 4 KB clusters */
1622
    s->cluster_size = 1 << s->cluster_bits;
1623
    header.cluster_bits = cpu_to_be32(s->cluster_bits);
1624
    header_size = (header_size + 7) & ~7;
1625
    if (flags & BLOCK_FLAG_ENCRYPT) {
1626
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1627
    } else {
1628
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1629
    }
1630
    l2_bits = s->cluster_bits - 3;
1631
    shift = s->cluster_bits + l2_bits;
1632
    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
1633
    offset = align_offset(header_size, s->cluster_size);
1634
    s->l1_table_offset = offset;
1635
    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
1636
    header.l1_size = cpu_to_be32(l1_size);
1637
    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
1638

    
1639
    s->refcount_table = qemu_mallocz(s->cluster_size);
1640

    
1641
    s->refcount_table_offset = offset;
1642
    header.refcount_table_offset = cpu_to_be64(offset);
1643
    header.refcount_table_clusters = cpu_to_be32(1);
1644
    offset += s->cluster_size;
1645
    s->refcount_block_offset = offset;
1646

    
1647
    /* count how many refcount blocks needed */
1648
    tmp = offset >> s->cluster_bits;
1649
    ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1;
1650
    for (i=0; i < ref_clusters; i++) {
1651
        s->refcount_table[i] = cpu_to_be64(offset);
1652
        offset += s->cluster_size;
1653
    }
1654

    
1655
    s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size);
1656

    
1657
    /* update refcounts */
1658
    create_refcount_update(s, 0, header_size);
1659
    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
1660
    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
1661
    create_refcount_update(s, s->refcount_block_offset, ref_clusters * s->cluster_size);
1662

    
1663
    /* write all the data */
1664
    write(fd, &header, sizeof(header));
1665
    if (backing_file) {
1666
        if (backing_format_len) {
1667
            char zero[16];
1668
            int d = ext_bf.len - backing_format_len;
1669

    
1670
            memset(zero, 0, sizeof(zero));
1671
            cpu_to_be32s(&ext_bf.magic);
1672
            cpu_to_be32s(&ext_bf.len);
1673
            write(fd, &ext_bf, sizeof(ext_bf));
1674
            write(fd, backing_format, backing_format_len);
1675
            if (d>0) {
1676
                write(fd, zero, d);
1677
            }
1678
        }
1679
        write(fd, backing_file, backing_filename_len);
1680
    }
1681
    lseek(fd, s->l1_table_offset, SEEK_SET);
1682
    tmp = 0;
1683
    for(i = 0;i < l1_size; i++) {
1684
        write(fd, &tmp, sizeof(tmp));
1685
    }
1686
    lseek(fd, s->refcount_table_offset, SEEK_SET);
1687
    write(fd, s->refcount_table, s->cluster_size);
1688

    
1689
    lseek(fd, s->refcount_block_offset, SEEK_SET);
1690
    write(fd, s->refcount_block, ref_clusters * s->cluster_size);
1691

    
1692
    qemu_free(s->refcount_table);
1693
    qemu_free(s->refcount_block);
1694
    close(fd);
1695
    return 0;
1696
}
1697

    
1698
static int qcow_create(const char *filename, int64_t total_size,
1699
                       const char *backing_file, int flags)
1700
{
1701
    return qcow_create2(filename, total_size, backing_file, NULL, flags);
1702
}
1703

    
1704
static int qcow_make_empty(BlockDriverState *bs)
1705
{
1706
#if 0
1707
    /* XXX: not correct */
1708
    BDRVQcowState *s = bs->opaque;
1709
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1710
    int ret;
1711

1712
    memset(s->l1_table, 0, l1_length);
1713
    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
1714
        return -1;
1715
    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
1716
    if (ret < 0)
1717
        return ret;
1718

1719
    l2_cache_reset(bs);
1720
#endif
1721
    return 0;
1722
}
1723

    
1724
/* XXX: put compressed sectors first, then all the cluster aligned
1725
   tables to avoid losing bytes in alignment */
1726
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
1727
                                 const uint8_t *buf, int nb_sectors)
1728
{
1729
    BDRVQcowState *s = bs->opaque;
1730
    z_stream strm;
1731
    int ret, out_len;
1732
    uint8_t *out_buf;
1733
    uint64_t cluster_offset;
1734

    
1735
    if (nb_sectors == 0) {
1736
        /* align end of file to a sector boundary to ease reading with
1737
           sector based I/Os */
1738
        cluster_offset = bdrv_getlength(s->hd);
1739
        cluster_offset = (cluster_offset + 511) & ~511;
1740
        bdrv_truncate(s->hd, cluster_offset);
1741
        return 0;
1742
    }
1743

    
1744
    if (nb_sectors != s->cluster_sectors)
1745
        return -EINVAL;
1746

    
1747
    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1748

    
1749
    /* best compression, small window, no zlib header */
1750
    memset(&strm, 0, sizeof(strm));
1751
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1752
                       Z_DEFLATED, -12,
1753
                       9, Z_DEFAULT_STRATEGY);
1754
    if (ret != 0) {
1755
        qemu_free(out_buf);
1756
        return -1;
1757
    }
1758

    
1759
    strm.avail_in = s->cluster_size;
1760
    strm.next_in = (uint8_t *)buf;
1761
    strm.avail_out = s->cluster_size;
1762
    strm.next_out = out_buf;
1763

    
1764
    ret = deflate(&strm, Z_FINISH);
1765
    if (ret != Z_STREAM_END && ret != Z_OK) {
1766
        qemu_free(out_buf);
1767
        deflateEnd(&strm);
1768
        return -1;
1769
    }
1770
    out_len = strm.next_out - out_buf;
1771

    
1772
    deflateEnd(&strm);
1773

    
1774
    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1775
        /* could not compress: write normal cluster */
1776
        qcow_write(bs, sector_num, buf, s->cluster_sectors);
1777
    } else {
1778
        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
1779
                                              out_len);
1780
        if (!cluster_offset)
1781
            return -1;
1782
        cluster_offset &= s->cluster_offset_mask;
1783
        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
1784
            qemu_free(out_buf);
1785
            return -1;
1786
        }
1787
    }
1788

    
1789
    qemu_free(out_buf);
1790
    return 0;
1791
}
1792

    
1793
static void qcow_flush(BlockDriverState *bs)
1794
{
1795
    BDRVQcowState *s = bs->opaque;
1796
    bdrv_flush(s->hd);
1797
}
1798

    
1799
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1800
{
1801
    BDRVQcowState *s = bs->opaque;
1802
    bdi->cluster_size = s->cluster_size;
1803
    bdi->vm_state_offset = (int64_t)s->l1_vm_state_index <<
1804
        (s->cluster_bits + s->l2_bits);
1805
    return 0;
1806
}
1807

    
1808
/*********************************************************/
1809
/* snapshot support */
1810

    
1811
/* update the refcounts of snapshots and the copied flag */
1812
static int update_snapshot_refcount(BlockDriverState *bs,
1813
                                    int64_t l1_table_offset,
1814
                                    int l1_size,
1815
                                    int addend)
1816
{
1817
    BDRVQcowState *s = bs->opaque;
1818
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
1819
    int64_t old_offset, old_l2_offset;
1820
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
1821

    
1822
    l2_cache_reset(bs);
1823

    
1824
    l2_table = NULL;
1825
    l1_table = NULL;
1826
    l1_size2 = l1_size * sizeof(uint64_t);
1827
    l1_allocated = 0;
1828
    if (l1_table_offset != s->l1_table_offset) {
1829
        l1_table = qemu_malloc(l1_size2);
1830
        l1_allocated = 1;
1831
        if (bdrv_pread(s->hd, l1_table_offset,
1832
                       l1_table, l1_size2) != l1_size2)
1833
            goto fail;
1834
        for(i = 0;i < l1_size; i++)
1835
            be64_to_cpus(&l1_table[i]);
1836
    } else {
1837
        assert(l1_size == s->l1_size);
1838
        l1_table = s->l1_table;
1839
        l1_allocated = 0;
1840
    }
1841

    
1842
    l2_size = s->l2_size * sizeof(uint64_t);
1843
    l2_table = qemu_malloc(l2_size);
1844
    l1_modified = 0;
1845
    for(i = 0; i < l1_size; i++) {
1846
        l2_offset = l1_table[i];
1847
        if (l2_offset) {
1848
            old_l2_offset = l2_offset;
1849
            l2_offset &= ~QCOW_OFLAG_COPIED;
1850
            l2_modified = 0;
1851
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
1852
                goto fail;
1853
            for(j = 0; j < s->l2_size; j++) {
1854
                offset = be64_to_cpu(l2_table[j]);
1855
                if (offset != 0) {
1856
                    old_offset = offset;
1857
                    offset &= ~QCOW_OFLAG_COPIED;
1858
                    if (offset & QCOW_OFLAG_COMPRESSED) {
1859
                        nb_csectors = ((offset >> s->csize_shift) &
1860
                                       s->csize_mask) + 1;
1861
                        if (addend != 0)
1862
                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
1863
                                            nb_csectors * 512, addend);
1864
                        /* compressed clusters are never modified */
1865
                        refcount = 2;
1866
                    } else {
1867
                        if (addend != 0) {
1868
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
1869
                        } else {
1870
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
1871
                        }
1872
                    }
1873

    
1874
                    if (refcount == 1) {
1875
                        offset |= QCOW_OFLAG_COPIED;
1876
                    }
1877
                    if (offset != old_offset) {
1878
                        l2_table[j] = cpu_to_be64(offset);
1879
                        l2_modified = 1;
1880
                    }
1881
                }
1882
            }
1883
            if (l2_modified) {
1884
                if (bdrv_pwrite(s->hd,
1885
                                l2_offset, l2_table, l2_size) != l2_size)
1886
                    goto fail;
1887
            }
1888

    
1889
            if (addend != 0) {
1890
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
1891
            } else {
1892
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1893
            }
1894
            if (refcount == 1) {
1895
                l2_offset |= QCOW_OFLAG_COPIED;
1896
            }
1897
            if (l2_offset != old_l2_offset) {
1898
                l1_table[i] = l2_offset;
1899
                l1_modified = 1;
1900
            }
1901
        }
1902
    }
1903
    if (l1_modified) {
1904
        for(i = 0; i < l1_size; i++)
1905
            cpu_to_be64s(&l1_table[i]);
1906
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
1907
                        l1_size2) != l1_size2)
1908
            goto fail;
1909
        for(i = 0; i < l1_size; i++)
1910
            be64_to_cpus(&l1_table[i]);
1911
    }
1912
    if (l1_allocated)
1913
        qemu_free(l1_table);
1914
    qemu_free(l2_table);
1915
    return 0;
1916
 fail:
1917
    if (l1_allocated)
1918
        qemu_free(l1_table);
1919
    qemu_free(l2_table);
1920
    return -EIO;
1921
}
1922

    
1923
static void qcow_free_snapshots(BlockDriverState *bs)
1924
{
1925
    BDRVQcowState *s = bs->opaque;
1926
    int i;
1927

    
1928
    for(i = 0; i < s->nb_snapshots; i++) {
1929
        qemu_free(s->snapshots[i].name);
1930
        qemu_free(s->snapshots[i].id_str);
1931
    }
1932
    qemu_free(s->snapshots);
1933
    s->snapshots = NULL;
1934
    s->nb_snapshots = 0;
1935
}
1936

    
1937
static int qcow_read_snapshots(BlockDriverState *bs)
1938
{
1939
    BDRVQcowState *s = bs->opaque;
1940
    QCowSnapshotHeader h;
1941
    QCowSnapshot *sn;
1942
    int i, id_str_size, name_size;
1943
    int64_t offset;
1944
    uint32_t extra_data_size;
1945

    
1946
    if (!s->nb_snapshots) {
1947
        s->snapshots = NULL;
1948
        s->snapshots_size = 0;
1949
        return 0;
1950
    }
1951

    
1952
    offset = s->snapshots_offset;
1953
    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
1954
    for(i = 0; i < s->nb_snapshots; i++) {
1955
        offset = align_offset(offset, 8);
1956
        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
1957
            goto fail;
1958
        offset += sizeof(h);
1959
        sn = s->snapshots + i;
1960
        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
1961
        sn->l1_size = be32_to_cpu(h.l1_size);
1962
        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
1963
        sn->date_sec = be32_to_cpu(h.date_sec);
1964
        sn->date_nsec = be32_to_cpu(h.date_nsec);
1965
        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
1966
        extra_data_size = be32_to_cpu(h.extra_data_size);
1967

    
1968
        id_str_size = be16_to_cpu(h.id_str_size);
1969
        name_size = be16_to_cpu(h.name_size);
1970

    
1971
        offset += extra_data_size;
1972

    
1973
        sn->id_str = qemu_malloc(id_str_size + 1);
1974
        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
1975
            goto fail;
1976
        offset += id_str_size;
1977
        sn->id_str[id_str_size] = '\0';
1978

    
1979
        sn->name = qemu_malloc(name_size + 1);
1980
        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
1981
            goto fail;
1982
        offset += name_size;
1983
        sn->name[name_size] = '\0';
1984
    }
1985
    s->snapshots_size = offset - s->snapshots_offset;
1986
    return 0;
1987
 fail:
1988
    qcow_free_snapshots(bs);
1989
    return -1;
1990
}
1991

    
1992
/* add at the end of the file a new list of snapshots */
1993
static int qcow_write_snapshots(BlockDriverState *bs)
1994
{
1995
    BDRVQcowState *s = bs->opaque;
1996
    QCowSnapshot *sn;
1997
    QCowSnapshotHeader h;
1998
    int i, name_size, id_str_size, snapshots_size;
1999
    uint64_t data64;
2000
    uint32_t data32;
2001
    int64_t offset, snapshots_offset;
2002

    
2003
    /* compute the size of the snapshots */
2004
    offset = 0;
2005
    for(i = 0; i < s->nb_snapshots; i++) {
2006
        sn = s->snapshots + i;
2007
        offset = align_offset(offset, 8);
2008
        offset += sizeof(h);
2009
        offset += strlen(sn->id_str);
2010
        offset += strlen(sn->name);
2011
    }
2012
    snapshots_size = offset;
2013

    
2014
    snapshots_offset = alloc_clusters(bs, snapshots_size);
2015
    offset = snapshots_offset;
2016

    
2017
    for(i = 0; i < s->nb_snapshots; i++) {
2018
        sn = s->snapshots + i;
2019
        memset(&h, 0, sizeof(h));
2020
        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
2021
        h.l1_size = cpu_to_be32(sn->l1_size);
2022
        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
2023
        h.date_sec = cpu_to_be32(sn->date_sec);
2024
        h.date_nsec = cpu_to_be32(sn->date_nsec);
2025
        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
2026

    
2027
        id_str_size = strlen(sn->id_str);
2028
        name_size = strlen(sn->name);
2029
        h.id_str_size = cpu_to_be16(id_str_size);
2030
        h.name_size = cpu_to_be16(name_size);
2031
        offset = align_offset(offset, 8);
2032
        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
2033
            goto fail;
2034
        offset += sizeof(h);
2035
        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
2036
            goto fail;
2037
        offset += id_str_size;
2038
        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
2039
            goto fail;
2040
        offset += name_size;
2041
    }
2042

    
2043
    /* update the various header fields */
2044
    data64 = cpu_to_be64(snapshots_offset);
2045
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
2046
                    &data64, sizeof(data64)) != sizeof(data64))
2047
        goto fail;
2048
    data32 = cpu_to_be32(s->nb_snapshots);
2049
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
2050
                    &data32, sizeof(data32)) != sizeof(data32))
2051
        goto fail;
2052

    
2053
    /* free the old snapshot table */
2054
    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
2055
    s->snapshots_offset = snapshots_offset;
2056
    s->snapshots_size = snapshots_size;
2057
    return 0;
2058
 fail:
2059
    return -1;
2060
}
2061

    
2062
static void find_new_snapshot_id(BlockDriverState *bs,
2063
                                 char *id_str, int id_str_size)
2064
{
2065
    BDRVQcowState *s = bs->opaque;
2066
    QCowSnapshot *sn;
2067
    int i, id, id_max = 0;
2068

    
2069
    for(i = 0; i < s->nb_snapshots; i++) {
2070
        sn = s->snapshots + i;
2071
        id = strtoul(sn->id_str, NULL, 10);
2072
        if (id > id_max)
2073
            id_max = id;
2074
    }
2075
    snprintf(id_str, id_str_size, "%d", id_max + 1);
2076
}
2077

    
2078
static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
2079
{
2080
    BDRVQcowState *s = bs->opaque;
2081
    int i;
2082

    
2083
    for(i = 0; i < s->nb_snapshots; i++) {
2084
        if (!strcmp(s->snapshots[i].id_str, id_str))
2085
            return i;
2086
    }
2087
    return -1;
2088
}
2089

    
2090
static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
2091
{
2092
    BDRVQcowState *s = bs->opaque;
2093
    int i, ret;
2094

    
2095
    ret = find_snapshot_by_id(bs, name);
2096
    if (ret >= 0)
2097
        return ret;
2098
    for(i = 0; i < s->nb_snapshots; i++) {
2099
        if (!strcmp(s->snapshots[i].name, name))
2100
            return i;
2101
    }
2102
    return -1;
2103
}
2104

    
2105
/* if no id is provided, a new one is constructed */
2106
static int qcow_snapshot_create(BlockDriverState *bs,
2107
                                QEMUSnapshotInfo *sn_info)
2108
{
2109
    BDRVQcowState *s = bs->opaque;
2110
    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
2111
    int i, ret;
2112
    uint64_t *l1_table = NULL;
2113

    
2114
    memset(sn, 0, sizeof(*sn));
2115

    
2116
    if (sn_info->id_str[0] == '\0') {
2117
        /* compute a new id */
2118
        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
2119
    }
2120

    
2121
    /* check that the ID is unique */
2122
    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
2123
        return -ENOENT;
2124

    
2125
    sn->id_str = qemu_strdup(sn_info->id_str);
2126
    if (!sn->id_str)
2127
        goto fail;
2128
    sn->name = qemu_strdup(sn_info->name);
2129
    if (!sn->name)
2130
        goto fail;
2131
    sn->vm_state_size = sn_info->vm_state_size;
2132
    sn->date_sec = sn_info->date_sec;
2133
    sn->date_nsec = sn_info->date_nsec;
2134
    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
2135

    
2136
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
2137
    if (ret < 0)
2138
        goto fail;
2139

    
2140
    /* create the L1 table of the snapshot */
2141
    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
2142
    sn->l1_size = s->l1_size;
2143

    
2144
    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
2145
    for(i = 0; i < s->l1_size; i++) {
2146
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
2147
    }
2148
    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
2149
                    l1_table, s->l1_size * sizeof(uint64_t)) !=
2150
        (s->l1_size * sizeof(uint64_t)))
2151
        goto fail;
2152
    qemu_free(l1_table);
2153
    l1_table = NULL;
2154

    
2155
    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
2156
    if (s->snapshots) {
2157
        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
2158
        qemu_free(s->snapshots);
2159
    }
2160
    s->snapshots = snapshots1;
2161
    s->snapshots[s->nb_snapshots++] = *sn;
2162

    
2163
    if (qcow_write_snapshots(bs) < 0)
2164
        goto fail;
2165
#ifdef DEBUG_ALLOC
2166
    check_refcounts(bs);
2167
#endif
2168
    return 0;
2169
 fail:
2170
    qemu_free(sn->name);
2171
    qemu_free(l1_table);
2172
    return -1;
2173
}
2174

    
2175
/* copy the snapshot 'snapshot_name' into the current disk image */
2176
static int qcow_snapshot_goto(BlockDriverState *bs,
2177
                              const char *snapshot_id)
2178
{
2179
    BDRVQcowState *s = bs->opaque;
2180
    QCowSnapshot *sn;
2181
    int i, snapshot_index, l1_size2;
2182

    
2183
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
2184
    if (snapshot_index < 0)
2185
        return -ENOENT;
2186
    sn = &s->snapshots[snapshot_index];
2187

    
2188
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
2189
        goto fail;
2190

    
2191
    if (grow_l1_table(bs, sn->l1_size) < 0)
2192
        goto fail;
2193

    
2194
    s->l1_size = sn->l1_size;
2195
    l1_size2 = s->l1_size * sizeof(uint64_t);
2196
    /* copy the snapshot l1 table to the current l1 table */
2197
    if (bdrv_pread(s->hd, sn->l1_table_offset,
2198
                   s->l1_table, l1_size2) != l1_size2)
2199
        goto fail;
2200
    if (bdrv_pwrite(s->hd, s->l1_table_offset,
2201
                    s->l1_table, l1_size2) != l1_size2)
2202
        goto fail;
2203
    for(i = 0;i < s->l1_size; i++) {
2204
        be64_to_cpus(&s->l1_table[i]);
2205
    }
2206

    
2207
    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
2208
        goto fail;
2209

    
2210
#ifdef DEBUG_ALLOC
2211
    check_refcounts(bs);
2212
#endif
2213
    return 0;
2214
 fail:
2215
    return -EIO;
2216
}
2217

    
2218
static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2219
{
2220
    BDRVQcowState *s = bs->opaque;
2221
    QCowSnapshot *sn;
2222
    int snapshot_index, ret;
2223

    
2224
    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
2225
    if (snapshot_index < 0)
2226
        return -ENOENT;
2227
    sn = &s->snapshots[snapshot_index];
2228

    
2229
    ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
2230
    if (ret < 0)
2231
        return ret;
2232
    /* must update the copied flag on the current cluster offsets */
2233
    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
2234
    if (ret < 0)
2235
        return ret;
2236
    free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));
2237

    
2238
    qemu_free(sn->id_str);
2239
    qemu_free(sn->name);
2240
    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
2241
    s->nb_snapshots--;
2242
    ret = qcow_write_snapshots(bs);
2243
    if (ret < 0) {
2244
        /* XXX: restore snapshot if error ? */
2245
        return ret;
2246
    }
2247
#ifdef DEBUG_ALLOC
2248
    check_refcounts(bs);
2249
#endif
2250
    return 0;
2251
}
2252

    
2253
static int qcow_snapshot_list(BlockDriverState *bs,
2254
                              QEMUSnapshotInfo **psn_tab)
2255
{
2256
    BDRVQcowState *s = bs->opaque;
2257
    QEMUSnapshotInfo *sn_tab, *sn_info;
2258
    QCowSnapshot *sn;
2259
    int i;
2260

    
2261
    sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
2262
    for(i = 0; i < s->nb_snapshots; i++) {
2263
        sn_info = sn_tab + i;
2264
        sn = s->snapshots + i;
2265
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
2266
                sn->id_str);
2267
        pstrcpy(sn_info->name, sizeof(sn_info->name),
2268
                sn->name);
2269
        sn_info->vm_state_size = sn->vm_state_size;
2270
        sn_info->date_sec = sn->date_sec;
2271
        sn_info->date_nsec = sn->date_nsec;
2272
        sn_info->vm_clock_nsec = sn->vm_clock_nsec;
2273
    }
2274
    *psn_tab = sn_tab;
2275
    return s->nb_snapshots;
2276
}
2277

    
2278
/*********************************************************/
2279
/* refcount handling */
2280

    
2281
static int refcount_init(BlockDriverState *bs)
2282
{
2283
    BDRVQcowState *s = bs->opaque;
2284
    int ret, refcount_table_size2, i;
2285

    
2286
    s->refcount_block_cache = qemu_malloc(s->cluster_size);
2287
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
2288
    s->refcount_table = qemu_malloc(refcount_table_size2);
2289
    if (s->refcount_table_size > 0) {
2290
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
2291
                         s->refcount_table, refcount_table_size2);
2292
        if (ret != refcount_table_size2)
2293
            goto fail;
2294
        for(i = 0; i < s->refcount_table_size; i++)
2295
            be64_to_cpus(&s->refcount_table[i]);
2296
    }
2297
    return 0;
2298
 fail:
2299
    return -ENOMEM;
2300
}
2301

    
2302
static void refcount_close(BlockDriverState *bs)
2303
{
2304
    BDRVQcowState *s = bs->opaque;
2305
    qemu_free(s->refcount_block_cache);
2306
    qemu_free(s->refcount_table);
2307
}
2308

    
2309

    
2310
static int load_refcount_block(BlockDriverState *bs,
2311
                               int64_t refcount_block_offset)
2312
{
2313
    BDRVQcowState *s = bs->opaque;
2314
    int ret;
2315
    ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
2316
                     s->cluster_size);
2317
    if (ret != s->cluster_size)
2318
        return -EIO;
2319
    s->refcount_block_cache_offset = refcount_block_offset;
2320
    return 0;
2321
}
2322

    
2323
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
2324
{
2325
    BDRVQcowState *s = bs->opaque;
2326
    int refcount_table_index, block_index;
2327
    int64_t refcount_block_offset;
2328

    
2329
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2330
    if (refcount_table_index >= s->refcount_table_size)
2331
        return 0;
2332
    refcount_block_offset = s->refcount_table[refcount_table_index];
2333
    if (!refcount_block_offset)
2334
        return 0;
2335
    if (refcount_block_offset != s->refcount_block_cache_offset) {
2336
        /* better than nothing: return allocated if read error */
2337
        if (load_refcount_block(bs, refcount_block_offset) < 0)
2338
            return 1;
2339
    }
2340
    block_index = cluster_index &
2341
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2342
    return be16_to_cpu(s->refcount_block_cache[block_index]);
2343
}
2344

    
2345
/* return < 0 if error */
2346
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
2347
{
2348
    BDRVQcowState *s = bs->opaque;
2349
    int i, nb_clusters;
2350

    
2351
    nb_clusters = size_to_clusters(s, size);
2352
retry:
2353
    for(i = 0; i < nb_clusters; i++) {
2354
        int64_t i = s->free_cluster_index++;
2355
        if (get_refcount(bs, i) != 0)
2356
            goto retry;
2357
    }
2358
#ifdef DEBUG_ALLOC2
2359
    printf("alloc_clusters: size=%lld -> %lld\n",
2360
            size,
2361
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
2362
#endif
2363
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
2364
}
2365

    
2366
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
2367
{
2368
    int64_t offset;
2369

    
2370
    offset = alloc_clusters_noref(bs, size);
2371
    update_refcount(bs, offset, size, 1);
2372
    return offset;
2373
}
2374

    
2375
/* only used to allocate compressed sectors. We try to allocate
2376
   contiguous sectors. size must be <= cluster_size */
2377
static int64_t alloc_bytes(BlockDriverState *bs, int size)
2378
{
2379
    BDRVQcowState *s = bs->opaque;
2380
    int64_t offset, cluster_offset;
2381
    int free_in_cluster;
2382

    
2383
    assert(size > 0 && size <= s->cluster_size);
2384
    if (s->free_byte_offset == 0) {
2385
        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
2386
    }
2387
 redo:
2388
    free_in_cluster = s->cluster_size -
2389
        (s->free_byte_offset & (s->cluster_size - 1));
2390
    if (size <= free_in_cluster) {
2391
        /* enough space in current cluster */
2392
        offset = s->free_byte_offset;
2393
        s->free_byte_offset += size;
2394
        free_in_cluster -= size;
2395
        if (free_in_cluster == 0)
2396
            s->free_byte_offset = 0;
2397
        if ((offset & (s->cluster_size - 1)) != 0)
2398
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2399
    } else {
2400
        offset = alloc_clusters(bs, s->cluster_size);
2401
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
2402
        if ((cluster_offset + s->cluster_size) == offset) {
2403
            /* we are lucky: contiguous data */
2404
            offset = s->free_byte_offset;
2405
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
2406
            s->free_byte_offset += size;
2407
        } else {
2408
            s->free_byte_offset = offset;
2409
            goto redo;
2410
        }
2411
    }
2412
    return offset;
2413
}
2414

    
2415
static void free_clusters(BlockDriverState *bs,
2416
                          int64_t offset, int64_t size)
2417
{
2418
    update_refcount(bs, offset, size, -1);
2419
}
2420

    
2421
static int grow_refcount_table(BlockDriverState *bs, int min_size)
2422
{
2423
    BDRVQcowState *s = bs->opaque;
2424
    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
2425
    uint64_t *new_table;
2426
    int64_t table_offset;
2427
    uint8_t data[12];
2428
    int old_table_size;
2429
    int64_t old_table_offset;
2430

    
2431
    if (min_size <= s->refcount_table_size)
2432
        return 0;
2433
    /* compute new table size */
2434
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
2435
    for(;;) {
2436
        if (refcount_table_clusters == 0) {
2437
            refcount_table_clusters = 1;
2438
        } else {
2439
            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
2440
        }
2441
        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
2442
        if (min_size <= new_table_size)
2443
            break;
2444
    }
2445
#ifdef DEBUG_ALLOC2
2446
    printf("grow_refcount_table from %d to %d\n",
2447
           s->refcount_table_size,
2448
           new_table_size);
2449
#endif
2450
    new_table_size2 = new_table_size * sizeof(uint64_t);
2451
    new_table = qemu_mallocz(new_table_size2);
2452
    memcpy(new_table, s->refcount_table,
2453
           s->refcount_table_size * sizeof(uint64_t));
2454
    for(i = 0; i < s->refcount_table_size; i++)
2455
        cpu_to_be64s(&new_table[i]);
2456
    /* Note: we cannot update the refcount now to avoid recursion */
2457
    table_offset = alloc_clusters_noref(bs, new_table_size2);
2458
    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
2459
    if (ret != new_table_size2)
2460
        goto fail;
2461
    for(i = 0; i < s->refcount_table_size; i++)
2462
        be64_to_cpus(&new_table[i]);
2463

    
2464
    cpu_to_be64w((uint64_t*)data, table_offset);
2465
    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
2466
    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
2467
                    data, sizeof(data)) != sizeof(data))
2468
        goto fail;
2469
    qemu_free(s->refcount_table);
2470
    old_table_offset = s->refcount_table_offset;
2471
    old_table_size = s->refcount_table_size;
2472
    s->refcount_table = new_table;
2473
    s->refcount_table_size = new_table_size;
2474
    s->refcount_table_offset = table_offset;
2475

    
2476
    update_refcount(bs, table_offset, new_table_size2, 1);
2477
    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
2478
    return 0;
2479
 fail:
2480
    free_clusters(bs, table_offset, new_table_size2);
2481
    qemu_free(new_table);
2482
    return -EIO;
2483
}
2484

    
2485
/* addend must be 1 or -1 */
2486
/* XXX: cache several refcount block clusters ? */
2487
static int update_cluster_refcount(BlockDriverState *bs,
2488
                                   int64_t cluster_index,
2489
                                   int addend)
2490
{
2491
    BDRVQcowState *s = bs->opaque;
2492
    int64_t offset, refcount_block_offset;
2493
    int ret, refcount_table_index, block_index, refcount;
2494
    uint64_t data64;
2495

    
2496
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
2497
    if (refcount_table_index >= s->refcount_table_size) {
2498
        if (addend < 0)
2499
            return -EINVAL;
2500
        ret = grow_refcount_table(bs, refcount_table_index + 1);
2501
        if (ret < 0)
2502
            return ret;
2503
    }
2504
    refcount_block_offset = s->refcount_table[refcount_table_index];
2505
    if (!refcount_block_offset) {
2506
        if (addend < 0)
2507
            return -EINVAL;
2508
        /* create a new refcount block */
2509
        /* Note: we cannot update the refcount now to avoid recursion */
2510
        offset = alloc_clusters_noref(bs, s->cluster_size);
2511
        memset(s->refcount_block_cache, 0, s->cluster_size);
2512
        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
2513
        if (ret != s->cluster_size)
2514
            return -EINVAL;
2515
        s->refcount_table[refcount_table_index] = offset;
2516
        data64 = cpu_to_be64(offset);
2517
        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
2518
                          refcount_table_index * sizeof(uint64_t),
2519
                          &data64, sizeof(data64));
2520
        if (ret != sizeof(data64))
2521
            return -EINVAL;
2522

    
2523
        refcount_block_offset = offset;
2524
        s->refcount_block_cache_offset = offset;
2525
        update_refcount(bs, offset, s->cluster_size, 1);
2526
    } else {
2527
        if (refcount_block_offset != s->refcount_block_cache_offset) {
2528
            if (load_refcount_block(bs, refcount_block_offset) < 0)
2529
                return -EIO;
2530
        }
2531
    }
2532
    /* we can update the count and save it */
2533
    block_index = cluster_index &
2534
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
2535
    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
2536
    refcount += addend;
2537
    if (refcount < 0 || refcount > 0xffff)
2538
        return -EINVAL;
2539
    if (refcount == 0 && cluster_index < s->free_cluster_index) {
2540
        s->free_cluster_index = cluster_index;
2541
    }
2542
    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
2543
    if (bdrv_pwrite(s->hd,
2544
                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
2545
                    &s->refcount_block_cache[block_index], 2) != 2)
2546
        return -EIO;
2547
    return refcount;
2548
}
2549

    
2550
static void update_refcount(BlockDriverState *bs,
2551
                            int64_t offset, int64_t length,
2552
                            int addend)
2553
{
2554
    BDRVQcowState *s = bs->opaque;
2555
    int64_t start, last, cluster_offset;
2556

    
2557
#ifdef DEBUG_ALLOC2
2558
    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
2559
           offset, length, addend);
2560
#endif
2561
    if (length <= 0)
2562
        return;
2563
    start = offset & ~(s->cluster_size - 1);
2564
    last = (offset + length - 1) & ~(s->cluster_size - 1);
2565
    for(cluster_offset = start; cluster_offset <= last;
2566
        cluster_offset += s->cluster_size) {
2567
        update_cluster_refcount(bs, cluster_offset >> s->cluster_bits, addend);
2568
    }
2569
}
2570

    
2571
/*
2572
 * Increases the refcount for a range of clusters in a given refcount table.
2573
 * This is used to construct a temporary refcount table out of L1 and L2 tables
2574
 * which can be compared the the refcount table saved in the image.
2575
 *
2576
 * Returns the number of errors in the image that were found
2577
 */
2578
static int inc_refcounts(BlockDriverState *bs,
2579
                          uint16_t *refcount_table,
2580
                          int refcount_table_size,
2581
                          int64_t offset, int64_t size)
2582
{
2583
    BDRVQcowState *s = bs->opaque;
2584
    int64_t start, last, cluster_offset;
2585
    int k;
2586
    int errors = 0;
2587

    
2588
    if (size <= 0)
2589
        return 0;
2590

    
2591
    start = offset & ~(s->cluster_size - 1);
2592
    last = (offset + size - 1) & ~(s->cluster_size - 1);
2593
    for(cluster_offset = start; cluster_offset <= last;
2594
        cluster_offset += s->cluster_size) {
2595
        k = cluster_offset >> s->cluster_bits;
2596
        if (k < 0 || k >= refcount_table_size) {
2597
            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
2598
                cluster_offset);
2599
            errors++;
2600
        } else {
2601
            if (++refcount_table[k] == 0) {
2602
                fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
2603
                    "\n", cluster_offset);
2604
                errors++;
2605
            }
2606
        }
2607
    }
2608

    
2609
    return errors;
2610
}
2611

    
2612
/*
2613
 * Increases the refcount in the given refcount table for the all clusters
2614
 * referenced in the L2 table. While doing so, performs some checks on L2
2615
 * entries.
2616
 *
2617
 * Returns the number of errors found by the checks or -errno if an internal
2618
 * error occurred.
2619
 */
2620
static int check_refcounts_l2(BlockDriverState *bs,
2621
    uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
2622
    int check_copied)
2623
{
2624
    BDRVQcowState *s = bs->opaque;
2625
    uint64_t *l2_table, offset;
2626
    int i, l2_size, nb_csectors, refcount;
2627
    int errors = 0;
2628

    
2629
    /* Read L2 table from disk */
2630
    l2_size = s->l2_size * sizeof(uint64_t);
2631
    l2_table = qemu_malloc(l2_size);
2632

    
2633
    if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
2634
        goto fail;
2635

    
2636
    /* Do the actual checks */
2637
    for(i = 0; i < s->l2_size; i++) {
2638
        offset = be64_to_cpu(l2_table[i]);
2639
        if (offset != 0) {
2640
            if (offset & QCOW_OFLAG_COMPRESSED) {
2641
                /* Compressed clusters don't have QCOW_OFLAG_COPIED */
2642
                if (offset & QCOW_OFLAG_COPIED) {
2643
                    fprintf(stderr, "ERROR: cluster %" PRId64 ": "
2644
                        "copied flag must never be set for compressed "
2645
                        "clusters\n", offset >> s->cluster_bits);
2646
                    offset &= ~QCOW_OFLAG_COPIED;
2647
                    errors++;
2648
                }
2649

    
2650
                /* Mark cluster as used */
2651
                nb_csectors = ((offset >> s->csize_shift) &
2652
                               s->csize_mask) + 1;
2653
                offset &= s->cluster_offset_mask;
2654
                errors += inc_refcounts(bs, refcount_table,
2655
                              refcount_table_size,
2656
                              offset & ~511, nb_csectors * 512);
2657
            } else {
2658
                /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
2659
                if (check_copied) {
2660
                    uint64_t entry = offset;
2661
                    offset &= ~QCOW_OFLAG_COPIED;
2662
                    refcount = get_refcount(bs, offset >> s->cluster_bits);
2663
                    if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
2664
                        fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
2665
                            PRIx64 " refcount=%d\n", entry, refcount);
2666
                        errors++;
2667
                    }
2668
                }
2669

    
2670
                /* Mark cluster as used */
2671
                offset &= ~QCOW_OFLAG_COPIED;
2672
                errors += inc_refcounts(bs, refcount_table,
2673
                              refcount_table_size,
2674
                              offset, s->cluster_size);
2675

    
2676
                /* Correct offsets are cluster aligned */
2677
                if (offset & (s->cluster_size - 1)) {
2678
                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
2679
                        "properly aligned; L2 entry corrupted.\n", offset);
2680
                    errors++;
2681
                }
2682
            }
2683
        }
2684
    }
2685

    
2686
    qemu_free(l2_table);
2687
    return errors;
2688

    
2689
fail:
2690
    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
2691
    qemu_free(l2_table);
2692
    return -EIO;
2693
}
2694

    
2695
/*
2696
 * Increases the refcount for the L1 table, its L2 tables and all referenced
2697
 * clusters in the given refcount table. While doing so, performs some checks
2698
 * on L1 and L2 entries.
2699
 *
2700
 * Returns the number of errors found by the checks or -errno if an internal
2701
 * error occurred.
2702
 */
2703
static int check_refcounts_l1(BlockDriverState *bs,
2704
                              uint16_t *refcount_table,
2705
                              int refcount_table_size,
2706
                              int64_t l1_table_offset, int l1_size,
2707
                              int check_copied)
2708
{
2709
    BDRVQcowState *s = bs->opaque;
2710
    uint64_t *l1_table, l2_offset, l1_size2;
2711
    int i, refcount, ret;
2712
    int errors = 0;
2713

    
2714
    l1_size2 = l1_size * sizeof(uint64_t);
2715

    
2716
    /* Mark L1 table as used */
2717
    errors += inc_refcounts(bs, refcount_table, refcount_table_size,
2718
                  l1_table_offset, l1_size2);
2719

    
2720
    /* Read L1 table entries from disk */
2721
    l1_table = qemu_malloc(l1_size2);
2722
    if (bdrv_pread(s->hd, l1_table_offset,
2723
                   l1_table, l1_size2) != l1_size2)
2724
        goto fail;
2725
    for(i = 0;i < l1_size; i++)
2726
        be64_to_cpus(&l1_table[i]);
2727

    
2728
    /* Do the actual checks */
2729
    for(i = 0; i < l1_size; i++) {
2730
        l2_offset = l1_table[i];
2731
        if (l2_offset) {
2732
            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
2733
            if (check_copied) {
2734
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
2735
                    >> s->cluster_bits);
2736
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
2737
                    fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
2738
                        " refcount=%d\n", l2_offset, refcount);
2739
                    errors++;
2740
                }
2741
            }
2742

    
2743
            /* Mark L2 table as used */
2744
            l2_offset &= ~QCOW_OFLAG_COPIED;
2745
            errors += inc_refcounts(bs, refcount_table,
2746
                          refcount_table_size,
2747
                          l2_offset,
2748
                          s->cluster_size);
2749

    
2750
            /* L2 tables are cluster aligned */
2751
            if (l2_offset & (s->cluster_size - 1)) {
2752
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
2753
                    "cluster aligned; L1 entry corrupted\n", l2_offset);
2754
                errors++;
2755
            }
2756

    
2757
            /* Process and check L2 entries */
2758
            ret = check_refcounts_l2(bs, refcount_table, refcount_table_size,
2759
                l2_offset, check_copied);
2760
            if (ret < 0) {
2761
                goto fail;
2762
            }
2763
            errors += ret;
2764
        }
2765
    }
2766
    qemu_free(l1_table);
2767
    return errors;
2768

    
2769
fail:
2770
    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
2771
    qemu_free(l1_table);
2772
    return -EIO;
2773
}
2774

    
2775
/*
2776
 * Checks an image for refcount consistency.
2777
 *
2778
 * Returns 0 if no errors are found, the number of errors in case the image is
2779
 * detected as corrupted, and -errno when an internal error occured.
2780
 */
2781
static int check_refcounts(BlockDriverState *bs)
2782
{
2783
    BDRVQcowState *s = bs->opaque;
2784
    int64_t size;
2785
    int nb_clusters, refcount1, refcount2, i;
2786
    QCowSnapshot *sn;
2787
    uint16_t *refcount_table;
2788
    int ret, errors = 0;
2789

    
2790
    size = bdrv_getlength(s->hd);
2791
    nb_clusters = size_to_clusters(s, size);
2792
    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
2793

    
2794
    /* header */
2795
    errors += inc_refcounts(bs, refcount_table, nb_clusters,
2796
                  0, s->cluster_size);
2797

    
2798
    /* current L1 table */
2799
    ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
2800
                       s->l1_table_offset, s->l1_size, 1);
2801
    if (ret < 0) {
2802
        return ret;
2803
    }
2804
    errors += ret;
2805

    
2806
    /* snapshots */
2807
    for(i = 0; i < s->nb_snapshots; i++) {
2808
        sn = s->snapshots + i;
2809
        check_refcounts_l1(bs, refcount_table, nb_clusters,
2810
                           sn->l1_table_offset, sn->l1_size, 0);
2811
    }
2812
    errors += inc_refcounts(bs, refcount_table, nb_clusters,
2813
                  s->snapshots_offset, s->snapshots_size);
2814

    
2815
    /* refcount data */
2816
    errors += inc_refcounts(bs, refcount_table, nb_clusters,
2817
                  s->refcount_table_offset,
2818
                  s->refcount_table_size * sizeof(uint64_t));
2819
    for(i = 0; i < s->refcount_table_size; i++) {
2820
        int64_t offset;
2821
        offset = s->refcount_table[i];
2822
        if (offset != 0) {
2823
            errors += inc_refcounts(bs, refcount_table, nb_clusters,
2824
                          offset, s->cluster_size);
2825
        }
2826
    }
2827

    
2828
    /* compare ref counts */
2829
    for(i = 0; i < nb_clusters; i++) {
2830
        refcount1 = get_refcount(bs, i);
2831
        refcount2 = refcount_table[i];
2832
        if (refcount1 != refcount2) {
2833
            fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n",
2834
                   i, refcount1, refcount2);
2835
            errors++;
2836
        }
2837
    }
2838

    
2839
    qemu_free(refcount_table);
2840

    
2841
    return errors;
2842
}
2843

    
2844
static int qcow_check(BlockDriverState *bs)
2845
{
2846
    return check_refcounts(bs);
2847
}
2848

    
2849
#if 0
2850
static void dump_refcounts(BlockDriverState *bs)
2851
{
2852
    BDRVQcowState *s = bs->opaque;
2853
    int64_t nb_clusters, k, k1, size;
2854
    int refcount;
2855

2856
    size = bdrv_getlength(s->hd);
2857
    nb_clusters = size_to_clusters(s, size);
2858
    for(k = 0; k < nb_clusters;) {
2859
        k1 = k;
2860
        refcount = get_refcount(bs, k);
2861
        k++;
2862
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
2863
            k++;
2864
        printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1);
2865
    }
2866
}
2867
#endif
2868

    
2869
static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf,
2870
                           int64_t pos, int size)
2871
{
2872
    int growable = bs->growable;
2873

    
2874
    bs->growable = 1;
2875
    bdrv_pwrite(bs, pos, buf, size);
2876
    bs->growable = growable;
2877

    
2878
    return size;
2879
}
2880

    
2881
static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf,
2882
                           int64_t pos, int size)
2883
{
2884
    int growable = bs->growable;
2885
    int ret;
2886

    
2887
    bs->growable = 1;
2888
    ret = bdrv_pread(bs, pos, buf, size);
2889
    bs->growable = growable;
2890

    
2891
    return ret;
2892
}
2893

    
2894
BlockDriver bdrv_qcow2 = {
2895
    .format_name        = "qcow2",
2896
    .instance_size        = sizeof(BDRVQcowState),
2897
    .bdrv_probe                = qcow_probe,
2898
    .bdrv_open                = qcow_open,
2899
    .bdrv_close                = qcow_close,
2900
    .bdrv_create        = qcow_create,
2901
    .bdrv_flush                = qcow_flush,
2902
    .bdrv_is_allocated        = qcow_is_allocated,
2903
    .bdrv_set_key        = qcow_set_key,
2904
    .bdrv_make_empty        = qcow_make_empty,
2905

    
2906
    .bdrv_aio_readv        = qcow_aio_readv,
2907
    .bdrv_aio_writev        = qcow_aio_writev,
2908
    .bdrv_aio_cancel        = qcow_aio_cancel,
2909
    .aiocb_size                = sizeof(QCowAIOCB),
2910
    .bdrv_write_compressed = qcow_write_compressed,
2911

    
2912
    .bdrv_snapshot_create = qcow_snapshot_create,
2913
    .bdrv_snapshot_goto        = qcow_snapshot_goto,
2914
    .bdrv_snapshot_delete = qcow_snapshot_delete,
2915
    .bdrv_snapshot_list        = qcow_snapshot_list,
2916
    .bdrv_get_info        = qcow_get_info,
2917

    
2918
    .bdrv_put_buffer    = qcow_put_buffer,
2919
    .bdrv_get_buffer    = qcow_get_buffer,
2920

    
2921
    .bdrv_create2 = qcow_create2,
2922
    .bdrv_check = qcow_check,
2923
};