Statistics
| Branch: | Revision:

root / block / qcow2-refcount.c @ de7890db

History | View | Annotate | Download (35.4 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24

    
25
#include "qemu-common.h"
26
#include "block_int.h"
27
#include "block/qcow2.h"
28

    
29
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
30
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
31
                            int64_t offset, int64_t length,
32
                            int addend);
33

    
34

    
35
static int cache_refcount_updates = 0;
36

    
37
static int write_refcount_block(BDRVQcowState *s)
38
{
39
    size_t size = s->cluster_size;
40

    
41
    if (s->refcount_block_cache_offset == 0) {
42
        return 0;
43
    }
44

    
45
    if (bdrv_pwrite(s->hd, s->refcount_block_cache_offset,
46
            s->refcount_block_cache, size) != size)
47
    {
48
        return -EIO;
49
    }
50

    
51
    return 0;
52
}
53

    
54
/*********************************************************/
55
/* refcount handling */
56

    
57
int qcow2_refcount_init(BlockDriverState *bs)
58
{
59
    BDRVQcowState *s = bs->opaque;
60
    int ret, refcount_table_size2, i;
61

    
62
    s->refcount_block_cache = qemu_malloc(s->cluster_size);
63
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
64
    s->refcount_table = qemu_malloc(refcount_table_size2);
65
    if (s->refcount_table_size > 0) {
66
        ret = bdrv_pread(s->hd, s->refcount_table_offset,
67
                         s->refcount_table, refcount_table_size2);
68
        if (ret != refcount_table_size2)
69
            goto fail;
70
        for(i = 0; i < s->refcount_table_size; i++)
71
            be64_to_cpus(&s->refcount_table[i]);
72
    }
73
    return 0;
74
 fail:
75
    return -ENOMEM;
76
}
77

    
78
void qcow2_refcount_close(BlockDriverState *bs)
79
{
80
    BDRVQcowState *s = bs->opaque;
81
    qemu_free(s->refcount_block_cache);
82
    qemu_free(s->refcount_table);
83
}
84

    
85

    
86
static int load_refcount_block(BlockDriverState *bs,
87
                               int64_t refcount_block_offset)
88
{
89
    BDRVQcowState *s = bs->opaque;
90
    int ret;
91

    
92
    if (cache_refcount_updates) {
93
        write_refcount_block(s);
94
    }
95

    
96
    ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
97
                     s->cluster_size);
98
    if (ret != s->cluster_size)
99
        return -EIO;
100
    s->refcount_block_cache_offset = refcount_block_offset;
101
    return 0;
102
}
103

    
104
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
105
{
106
    BDRVQcowState *s = bs->opaque;
107
    int refcount_table_index, block_index;
108
    int64_t refcount_block_offset;
109

    
110
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
111
    if (refcount_table_index >= s->refcount_table_size)
112
        return 0;
113
    refcount_block_offset = s->refcount_table[refcount_table_index];
114
    if (!refcount_block_offset)
115
        return 0;
116
    if (refcount_block_offset != s->refcount_block_cache_offset) {
117
        /* better than nothing: return allocated if read error */
118
        if (load_refcount_block(bs, refcount_block_offset) < 0)
119
            return 1;
120
    }
121
    block_index = cluster_index &
122
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
123
    return be16_to_cpu(s->refcount_block_cache[block_index]);
124
}
125

    
126
/*
127
 * Rounds the refcount table size up to avoid growing the table for each single
128
 * refcount block that is allocated.
129
 */
130
static unsigned int next_refcount_table_size(BDRVQcowState *s,
131
    unsigned int min_size)
132
{
133
    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
134
    unsigned int refcount_table_clusters =
135
        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
136

    
137
    while (min_clusters > refcount_table_clusters) {
138
        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
139
    }
140

    
141
    return refcount_table_clusters << (s->cluster_bits - 3);
142
}
143

    
144

    
145
/* Checks if two offsets are described by the same refcount block */
146
static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
147
    uint64_t offset_b)
148
{
149
    uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
150
    uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
151

    
152
    return (block_a == block_b);
153
}
154

    
155
/*
156
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
157
 * (including growing the refcount table if needed).
158
 *
159
 * Returns the offset of the refcount block on success or -errno in error case
160
 */
161
static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
162
{
163
    BDRVQcowState *s = bs->opaque;
164
    unsigned int refcount_table_index;
165
    int ret;
166

    
167
    /* Find the refcount block for the given cluster */
168
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
169

    
170
    if (refcount_table_index < s->refcount_table_size) {
171

    
172
        uint64_t refcount_block_offset =
173
            s->refcount_table[refcount_table_index];
174

    
175
        /* If it's already there, we're done */
176
        if (refcount_block_offset) {
177
            if (refcount_block_offset != s->refcount_block_cache_offset) {
178
                ret = load_refcount_block(bs, refcount_block_offset);
179
                if (ret < 0) {
180
                    return ret;
181
                }
182
            }
183
            return refcount_block_offset;
184
        }
185
    }
186

    
187
    /*
188
     * If we came here, we need to allocate something. Something is at least
189
     * a cluster for the new refcount block. It may also include a new refcount
190
     * table if the old refcount table is too small.
191
     *
192
     * Note that allocating clusters here needs some special care:
193
     *
194
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
195
     *   increase the refcount and very likely we would end up with an endless
196
     *   recursion. Instead we must place the refcount blocks in a way that
197
     *   they can describe them themselves.
198
     *
199
     * - We need to consider that at this point we are inside update_refcounts
200
     *   and doing the initial refcount increase. This means that some clusters
201
     *   have already been allocated by the caller, but their refcount isn't
202
     *   accurate yet. free_cluster_index tells us where this allocation ends
203
     *   as long as we don't overwrite it by freeing clusters.
204
     *
205
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
206
     *   refcount block into the cache
207
     */
208

    
209
    if (cache_refcount_updates) {
210
        ret = write_refcount_block(s);
211
        if (ret < 0) {
212
            return ret;
213
        }
214
    }
215

    
216
    /* Allocate the refcount block itself and mark it as used */
217
    uint64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
218
    memset(s->refcount_block_cache, 0, s->cluster_size);
219
    s->refcount_block_cache_offset = new_block;
220

    
221
#ifdef DEBUG_ALLOC2
222
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
223
        " at %" PRIx64 "\n",
224
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
225
#endif
226

    
227
    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
228
        /* The block describes itself, need to update the cache */
229
        int block_index = (new_block >> s->cluster_bits) &
230
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
231
        s->refcount_block_cache[block_index] = cpu_to_be16(1);
232
    } else {
233
        /* Described somewhere else. This can recurse at most twice before we
234
         * arrive at a block that describes itself. */
235
        ret = update_refcount(bs, new_block, s->cluster_size, 1);
236
        if (ret < 0) {
237
            goto fail_block;
238
        }
239
    }
240

    
241
    /* Now the new refcount block needs to be written to disk */
242
    ret = bdrv_pwrite(s->hd, new_block, s->refcount_block_cache,
243
        s->cluster_size);
244
    if (ret < 0) {
245
        goto fail_block;
246
    }
247

    
248
    /* If the refcount table is big enough, just hook the block up there */
249
    if (refcount_table_index < s->refcount_table_size) {
250
        uint64_t data64 = cpu_to_be64(new_block);
251
        ret = bdrv_pwrite(s->hd,
252
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
253
            &data64, sizeof(data64));
254
        if (ret < 0) {
255
            goto fail_block;
256
        }
257

    
258
        s->refcount_table[refcount_table_index] = new_block;
259
        return new_block;
260
    }
261

    
262
    /*
263
     * If we come here, we need to grow the refcount table. Again, a new
264
     * refcount table needs some space and we can't simply allocate to avoid
265
     * endless recursion.
266
     *
267
     * Therefore let's grab new refcount blocks at the end of the image, which
268
     * will describe themselves and the new refcount table. This way we can
269
     * reference them only in the new table and do the switch to the new
270
     * refcount table at once without producing an inconsistent state in
271
     * between.
272
     */
273
    /* Calculate the number of refcount blocks needed so far */
274
    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
275
    uint64_t blocks_used = (s->free_cluster_index +
276
        refcount_block_clusters - 1) / refcount_block_clusters;
277

    
278
    /* And now we need at least one block more for the new metadata */
279
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
280
    uint64_t last_table_size;
281
    uint64_t blocks_clusters;
282
    do {
283
        uint64_t table_clusters = size_to_clusters(s, table_size);
284
        blocks_clusters = 1 +
285
            ((table_clusters + refcount_block_clusters - 1)
286
            / refcount_block_clusters);
287
        uint64_t meta_clusters = table_clusters + blocks_clusters;
288

    
289
        last_table_size = table_size;
290
        table_size = next_refcount_table_size(s, blocks_used +
291
            ((meta_clusters + refcount_block_clusters - 1)
292
            / refcount_block_clusters));
293

    
294
    } while (last_table_size != table_size);
295

    
296
#ifdef DEBUG_ALLOC2
297
    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
298
        s->refcount_table_size, table_size);
299
#endif
300

    
301
    /* Create the new refcount table and blocks */
302
    uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
303
        s->cluster_size;
304
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
305
    uint16_t *new_blocks = qemu_mallocz(blocks_clusters * s->cluster_size);
306
    uint64_t *new_table = qemu_mallocz(table_size * sizeof(uint64_t));
307

    
308
    assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
309

    
310
    /* Fill the new refcount table */
311
    memcpy(new_table, s->refcount_table,
312
        s->refcount_table_size * sizeof(uint64_t));
313
    new_table[refcount_table_index] = new_block;
314

    
315
    int i;
316
    for (i = 0; i < blocks_clusters; i++) {
317
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
318
    }
319

    
320
    /* Fill the refcount blocks */
321
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
322
    int block = 0;
323
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
324
        new_blocks[block++] = cpu_to_be16(1);
325
    }
326

    
327
    /* Write refcount blocks to disk */
328
    ret = bdrv_pwrite(s->hd, meta_offset, new_blocks,
329
        blocks_clusters * s->cluster_size);
330
    qemu_free(new_blocks);
331
    if (ret < 0) {
332
        goto fail_table;
333
    }
334

    
335
    /* Write refcount table to disk */
336
    for(i = 0; i < table_size; i++) {
337
        cpu_to_be64s(&new_table[i]);
338
    }
339

    
340
    ret = bdrv_pwrite(s->hd, table_offset, new_table,
341
        table_size * sizeof(uint64_t));
342
    if (ret < 0) {
343
        goto fail_table;
344
    }
345

    
346
    for(i = 0; i < table_size; i++) {
347
        cpu_to_be64s(&new_table[i]);
348
    }
349

    
350
    /* Hook up the new refcount table in the qcow2 header */
351
    uint8_t data[12];
352
    cpu_to_be64w((uint64_t*)data, table_offset);
353
    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
354
    ret = bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
355
        data, sizeof(data));
356
    if (ret < 0) {
357
        goto fail_table;
358
    }
359

    
360
    /* And switch it in memory */
361
    uint64_t old_table_offset = s->refcount_table_offset;
362
    uint64_t old_table_size = s->refcount_table_size;
363

    
364
    qemu_free(s->refcount_table);
365
    s->refcount_table = new_table;
366
    s->refcount_table_size = table_size;
367
    s->refcount_table_offset = table_offset;
368

    
369
    /* Free old table. Remember, we must not change free_cluster_index */
370
    uint64_t old_free_cluster_index = s->free_cluster_index;
371
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
372
    s->free_cluster_index = old_free_cluster_index;
373

    
374
    ret = load_refcount_block(bs, new_block);
375
    if (ret < 0) {
376
        goto fail_block;
377
    }
378

    
379
    return new_block;
380

    
381
fail_table:
382
    qemu_free(new_table);
383
fail_block:
384
    s->refcount_block_cache_offset = 0;
385
    return ret;
386
}
387

    
388
#define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
389
static int write_refcount_block_entries(BDRVQcowState *s,
390
    int64_t refcount_block_offset, int first_index, int last_index)
391
{
392
    size_t size;
393

    
394
    if (cache_refcount_updates) {
395
        return 0;
396
    }
397

    
398
    first_index &= ~(REFCOUNTS_PER_SECTOR - 1);
399
    last_index = (last_index + REFCOUNTS_PER_SECTOR)
400
        & ~(REFCOUNTS_PER_SECTOR - 1);
401

    
402
    size = (last_index - first_index) << REFCOUNT_SHIFT;
403
    if (bdrv_pwrite(s->hd,
404
        refcount_block_offset + (first_index << REFCOUNT_SHIFT),
405
        &s->refcount_block_cache[first_index], size) != size)
406
    {
407
        return -EIO;
408
    }
409

    
410
    return 0;
411
}
412

    
413
/* XXX: cache several refcount block clusters ? */
414
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
415
    int64_t offset, int64_t length, int addend)
416
{
417
    BDRVQcowState *s = bs->opaque;
418
    int64_t start, last, cluster_offset;
419
    int64_t refcount_block_offset = 0;
420
    int64_t table_index = -1, old_table_index;
421
    int first_index = -1, last_index = -1;
422
    int ret;
423

    
424
#ifdef DEBUG_ALLOC2
425
    printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
426
           offset, length, addend);
427
#endif
428
    if (length < 0) {
429
        return -EINVAL;
430
    } else if (length == 0) {
431
        return 0;
432
    }
433

    
434
    start = offset & ~(s->cluster_size - 1);
435
    last = (offset + length - 1) & ~(s->cluster_size - 1);
436
    for(cluster_offset = start; cluster_offset <= last;
437
        cluster_offset += s->cluster_size)
438
    {
439
        int block_index, refcount;
440
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
441
        int64_t new_block;
442

    
443
        /* Only write refcount block to disk when we are done with it */
444
        old_table_index = table_index;
445
        table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
446
        if ((old_table_index >= 0) && (table_index != old_table_index)) {
447

    
448
            if (write_refcount_block_entries(s, refcount_block_offset,
449
                first_index, last_index) < 0)
450
            {
451
                return -EIO;
452
            }
453

    
454
            first_index = -1;
455
            last_index = -1;
456
        }
457

    
458
        /* Load the refcount block and allocate it if needed */
459
        new_block = alloc_refcount_block(bs, cluster_index);
460
        if (new_block < 0) {
461
            ret = new_block;
462
            goto fail;
463
        }
464
        refcount_block_offset = new_block;
465

    
466
        /* we can update the count and save it */
467
        block_index = cluster_index &
468
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
469
        if (first_index == -1 || block_index < first_index) {
470
            first_index = block_index;
471
        }
472
        if (block_index > last_index) {
473
            last_index = block_index;
474
        }
475

    
476
        refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
477
        refcount += addend;
478
        if (refcount < 0 || refcount > 0xffff) {
479
            ret = -EINVAL;
480
            goto fail;
481
        }
482
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
483
            s->free_cluster_index = cluster_index;
484
        }
485
        s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
486
    }
487

    
488
    ret = 0;
489
fail:
490

    
491
    /* Write last changed block to disk */
492
    if (refcount_block_offset != 0) {
493
        if (write_refcount_block_entries(s, refcount_block_offset,
494
            first_index, last_index) < 0)
495
        {
496
            return ret < 0 ? ret : -EIO;
497
        }
498
    }
499

    
500
    /*
501
     * Try do undo any updates if an error is returned (This may succeed in
502
     * some cases like ENOSPC for allocating a new refcount block)
503
     */
504
    if (ret < 0) {
505
        int dummy;
506
        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
507
    }
508

    
509
    return ret;
510
}
511

    
512
/* addend must be 1 or -1 */
513
static int update_cluster_refcount(BlockDriverState *bs,
514
                                   int64_t cluster_index,
515
                                   int addend)
516
{
517
    BDRVQcowState *s = bs->opaque;
518
    int ret;
519

    
520
    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
521
    if (ret < 0) {
522
        return ret;
523
    }
524

    
525
    return get_refcount(bs, cluster_index);
526
}
527

    
528

    
529

    
530
/*********************************************************/
531
/* cluster allocation functions */
532

    
533

    
534

    
535
/* return < 0 if error */
536
static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
537
{
538
    BDRVQcowState *s = bs->opaque;
539
    int i, nb_clusters;
540

    
541
    nb_clusters = size_to_clusters(s, size);
542
retry:
543
    for(i = 0; i < nb_clusters; i++) {
544
        int64_t i = s->free_cluster_index++;
545
        if (get_refcount(bs, i) != 0)
546
            goto retry;
547
    }
548
#ifdef DEBUG_ALLOC2
549
    printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
550
            size,
551
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
552
#endif
553
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
554
}
555

    
556
int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
557
{
558
    int64_t offset;
559
    int ret;
560

    
561
    offset = alloc_clusters_noref(bs, size);
562
    ret = update_refcount(bs, offset, size, 1);
563
    if (ret < 0) {
564
        return ret;
565
    }
566
    return offset;
567
}
568

    
569
/* only used to allocate compressed sectors. We try to allocate
570
   contiguous sectors. size must be <= cluster_size */
571
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
572
{
573
    BDRVQcowState *s = bs->opaque;
574
    int64_t offset, cluster_offset;
575
    int free_in_cluster;
576

    
577
    assert(size > 0 && size <= s->cluster_size);
578
    if (s->free_byte_offset == 0) {
579
        s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size);
580
        if (s->free_byte_offset < 0) {
581
            return s->free_byte_offset;
582
        }
583
    }
584
 redo:
585
    free_in_cluster = s->cluster_size -
586
        (s->free_byte_offset & (s->cluster_size - 1));
587
    if (size <= free_in_cluster) {
588
        /* enough space in current cluster */
589
        offset = s->free_byte_offset;
590
        s->free_byte_offset += size;
591
        free_in_cluster -= size;
592
        if (free_in_cluster == 0)
593
            s->free_byte_offset = 0;
594
        if ((offset & (s->cluster_size - 1)) != 0)
595
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
596
    } else {
597
        offset = qcow2_alloc_clusters(bs, s->cluster_size);
598
        if (offset < 0) {
599
            return offset;
600
        }
601
        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
602
        if ((cluster_offset + s->cluster_size) == offset) {
603
            /* we are lucky: contiguous data */
604
            offset = s->free_byte_offset;
605
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
606
            s->free_byte_offset += size;
607
        } else {
608
            s->free_byte_offset = offset;
609
            goto redo;
610
        }
611
    }
612
    return offset;
613
}
614

    
615
void qcow2_free_clusters(BlockDriverState *bs,
616
                          int64_t offset, int64_t size)
617
{
618
    int ret;
619

    
620
    ret = update_refcount(bs, offset, size, -1);
621
    if (ret < 0) {
622
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
623
        abort();
624
    }
625
}
626

    
627
/*
628
 * free_any_clusters
629
 *
630
 * free clusters according to its type: compressed or not
631
 *
632
 */
633

    
634
void qcow2_free_any_clusters(BlockDriverState *bs,
635
    uint64_t cluster_offset, int nb_clusters)
636
{
637
    BDRVQcowState *s = bs->opaque;
638

    
639
    /* free the cluster */
640

    
641
    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
642
        int nb_csectors;
643
        nb_csectors = ((cluster_offset >> s->csize_shift) &
644
                       s->csize_mask) + 1;
645
        qcow2_free_clusters(bs,
646
            (cluster_offset & s->cluster_offset_mask) & ~511,
647
            nb_csectors * 512);
648
        return;
649
    }
650

    
651
    qcow2_free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
652

    
653
    return;
654
}
655

    
656

    
657

    
658
/*********************************************************/
659
/* snapshots and image creation */
660

    
661

    
662

    
663
void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
664
    int64_t size)
665
{
666
    int refcount;
667
    int64_t start, last, cluster_offset;
668
    uint16_t *p;
669

    
670
    start = offset & ~(s->cluster_size - 1);
671
    last = (offset + size - 1)  & ~(s->cluster_size - 1);
672
    for(cluster_offset = start; cluster_offset <= last;
673
        cluster_offset += s->cluster_size) {
674
        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
675
        refcount = be16_to_cpu(*p);
676
        refcount++;
677
        *p = cpu_to_be16(refcount);
678
    }
679
}
680

    
681
/* update the refcounts of snapshots and the copied flag */
682
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
683
    int64_t l1_table_offset, int l1_size, int addend)
684
{
685
    BDRVQcowState *s = bs->opaque;
686
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
687
    int64_t old_offset, old_l2_offset;
688
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
689

    
690
    qcow2_l2_cache_reset(bs);
691
    cache_refcount_updates = 1;
692

    
693
    l2_table = NULL;
694
    l1_table = NULL;
695
    l1_size2 = l1_size * sizeof(uint64_t);
696
    if (l1_table_offset != s->l1_table_offset) {
697
        if (l1_size2 != 0) {
698
            l1_table = qemu_mallocz(align_offset(l1_size2, 512));
699
        } else {
700
            l1_table = NULL;
701
        }
702
        l1_allocated = 1;
703
        if (bdrv_pread(s->hd, l1_table_offset,
704
                       l1_table, l1_size2) != l1_size2)
705
            goto fail;
706
        for(i = 0;i < l1_size; i++)
707
            be64_to_cpus(&l1_table[i]);
708
    } else {
709
        assert(l1_size == s->l1_size);
710
        l1_table = s->l1_table;
711
        l1_allocated = 0;
712
    }
713

    
714
    l2_size = s->l2_size * sizeof(uint64_t);
715
    l2_table = qemu_malloc(l2_size);
716
    l1_modified = 0;
717
    for(i = 0; i < l1_size; i++) {
718
        l2_offset = l1_table[i];
719
        if (l2_offset) {
720
            old_l2_offset = l2_offset;
721
            l2_offset &= ~QCOW_OFLAG_COPIED;
722
            l2_modified = 0;
723
            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
724
                goto fail;
725
            for(j = 0; j < s->l2_size; j++) {
726
                offset = be64_to_cpu(l2_table[j]);
727
                if (offset != 0) {
728
                    old_offset = offset;
729
                    offset &= ~QCOW_OFLAG_COPIED;
730
                    if (offset & QCOW_OFLAG_COMPRESSED) {
731
                        nb_csectors = ((offset >> s->csize_shift) &
732
                                       s->csize_mask) + 1;
733
                        if (addend != 0) {
734
                            int ret;
735
                            ret = update_refcount(bs,
736
                                (offset & s->cluster_offset_mask) & ~511,
737
                                nb_csectors * 512, addend);
738
                            if (ret < 0) {
739
                                goto fail;
740
                            }
741
                        }
742
                        /* compressed clusters are never modified */
743
                        refcount = 2;
744
                    } else {
745
                        if (addend != 0) {
746
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
747
                        } else {
748
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
749
                        }
750
                    }
751

    
752
                    if (refcount == 1) {
753
                        offset |= QCOW_OFLAG_COPIED;
754
                    }
755
                    if (offset != old_offset) {
756
                        l2_table[j] = cpu_to_be64(offset);
757
                        l2_modified = 1;
758
                    }
759
                }
760
            }
761
            if (l2_modified) {
762
                if (bdrv_pwrite(s->hd,
763
                                l2_offset, l2_table, l2_size) != l2_size)
764
                    goto fail;
765
            }
766

    
767
            if (addend != 0) {
768
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
769
            } else {
770
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
771
            }
772
            if (refcount == 1) {
773
                l2_offset |= QCOW_OFLAG_COPIED;
774
            }
775
            if (l2_offset != old_l2_offset) {
776
                l1_table[i] = l2_offset;
777
                l1_modified = 1;
778
            }
779
        }
780
    }
781
    if (l1_modified) {
782
        for(i = 0; i < l1_size; i++)
783
            cpu_to_be64s(&l1_table[i]);
784
        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
785
                        l1_size2) != l1_size2)
786
            goto fail;
787
        for(i = 0; i < l1_size; i++)
788
            be64_to_cpus(&l1_table[i]);
789
    }
790
    if (l1_allocated)
791
        qemu_free(l1_table);
792
    qemu_free(l2_table);
793
    cache_refcount_updates = 0;
794
    write_refcount_block(s);
795
    return 0;
796
 fail:
797
    if (l1_allocated)
798
        qemu_free(l1_table);
799
    qemu_free(l2_table);
800
    cache_refcount_updates = 0;
801
    write_refcount_block(s);
802
    return -EIO;
803
}
804

    
805

    
806

    
807

    
808
/*********************************************************/
809
/* refcount checking functions */
810

    
811

    
812

    
813
/*
814
 * Increases the refcount for a range of clusters in a given refcount table.
815
 * This is used to construct a temporary refcount table out of L1 and L2 tables
816
 * which can be compared the the refcount table saved in the image.
817
 *
818
 * Returns the number of errors in the image that were found
819
 */
820
static int inc_refcounts(BlockDriverState *bs,
821
                          uint16_t *refcount_table,
822
                          int refcount_table_size,
823
                          int64_t offset, int64_t size)
824
{
825
    BDRVQcowState *s = bs->opaque;
826
    int64_t start, last, cluster_offset;
827
    int k;
828
    int errors = 0;
829

    
830
    if (size <= 0)
831
        return 0;
832

    
833
    start = offset & ~(s->cluster_size - 1);
834
    last = (offset + size - 1) & ~(s->cluster_size - 1);
835
    for(cluster_offset = start; cluster_offset <= last;
836
        cluster_offset += s->cluster_size) {
837
        k = cluster_offset >> s->cluster_bits;
838
        if (k < 0 || k >= refcount_table_size) {
839
            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
840
                cluster_offset);
841
            errors++;
842
        } else {
843
            if (++refcount_table[k] == 0) {
844
                fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
845
                    "\n", cluster_offset);
846
                errors++;
847
            }
848
        }
849
    }
850

    
851
    return errors;
852
}
853

    
854
/*
855
 * Increases the refcount in the given refcount table for the all clusters
856
 * referenced in the L2 table. While doing so, performs some checks on L2
857
 * entries.
858
 *
859
 * Returns the number of errors found by the checks or -errno if an internal
860
 * error occurred.
861
 */
862
static int check_refcounts_l2(BlockDriverState *bs,
863
    uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
864
    int check_copied)
865
{
866
    BDRVQcowState *s = bs->opaque;
867
    uint64_t *l2_table, offset;
868
    int i, l2_size, nb_csectors, refcount;
869
    int errors = 0;
870

    
871
    /* Read L2 table from disk */
872
    l2_size = s->l2_size * sizeof(uint64_t);
873
    l2_table = qemu_malloc(l2_size);
874

    
875
    if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
876
        goto fail;
877

    
878
    /* Do the actual checks */
879
    for(i = 0; i < s->l2_size; i++) {
880
        offset = be64_to_cpu(l2_table[i]);
881
        if (offset != 0) {
882
            if (offset & QCOW_OFLAG_COMPRESSED) {
883
                /* Compressed clusters don't have QCOW_OFLAG_COPIED */
884
                if (offset & QCOW_OFLAG_COPIED) {
885
                    fprintf(stderr, "ERROR: cluster %" PRId64 ": "
886
                        "copied flag must never be set for compressed "
887
                        "clusters\n", offset >> s->cluster_bits);
888
                    offset &= ~QCOW_OFLAG_COPIED;
889
                    errors++;
890
                }
891

    
892
                /* Mark cluster as used */
893
                nb_csectors = ((offset >> s->csize_shift) &
894
                               s->csize_mask) + 1;
895
                offset &= s->cluster_offset_mask;
896
                errors += inc_refcounts(bs, refcount_table,
897
                              refcount_table_size,
898
                              offset & ~511, nb_csectors * 512);
899
            } else {
900
                /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
901
                if (check_copied) {
902
                    uint64_t entry = offset;
903
                    offset &= ~QCOW_OFLAG_COPIED;
904
                    refcount = get_refcount(bs, offset >> s->cluster_bits);
905
                    if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
906
                        fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
907
                            PRIx64 " refcount=%d\n", entry, refcount);
908
                        errors++;
909
                    }
910
                }
911

    
912
                /* Mark cluster as used */
913
                offset &= ~QCOW_OFLAG_COPIED;
914
                errors += inc_refcounts(bs, refcount_table,
915
                              refcount_table_size,
916
                              offset, s->cluster_size);
917

    
918
                /* Correct offsets are cluster aligned */
919
                if (offset & (s->cluster_size - 1)) {
920
                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
921
                        "properly aligned; L2 entry corrupted.\n", offset);
922
                    errors++;
923
                }
924
            }
925
        }
926
    }
927

    
928
    qemu_free(l2_table);
929
    return errors;
930

    
931
fail:
932
    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
933
    qemu_free(l2_table);
934
    return -EIO;
935
}
936

    
937
/*
938
 * Increases the refcount for the L1 table, its L2 tables and all referenced
939
 * clusters in the given refcount table. While doing so, performs some checks
940
 * on L1 and L2 entries.
941
 *
942
 * Returns the number of errors found by the checks or -errno if an internal
943
 * error occurred.
944
 */
945
static int check_refcounts_l1(BlockDriverState *bs,
946
                              uint16_t *refcount_table,
947
                              int refcount_table_size,
948
                              int64_t l1_table_offset, int l1_size,
949
                              int check_copied)
950
{
951
    BDRVQcowState *s = bs->opaque;
952
    uint64_t *l1_table, l2_offset, l1_size2;
953
    int i, refcount, ret;
954
    int errors = 0;
955

    
956
    l1_size2 = l1_size * sizeof(uint64_t);
957

    
958
    /* Mark L1 table as used */
959
    errors += inc_refcounts(bs, refcount_table, refcount_table_size,
960
                  l1_table_offset, l1_size2);
961

    
962
    /* Read L1 table entries from disk */
963
    if (l1_size2 == 0) {
964
        l1_table = NULL;
965
    } else {
966
        l1_table = qemu_malloc(l1_size2);
967
        if (bdrv_pread(s->hd, l1_table_offset,
968
                       l1_table, l1_size2) != l1_size2)
969
            goto fail;
970
        for(i = 0;i < l1_size; i++)
971
            be64_to_cpus(&l1_table[i]);
972
    }
973

    
974
    /* Do the actual checks */
975
    for(i = 0; i < l1_size; i++) {
976
        l2_offset = l1_table[i];
977
        if (l2_offset) {
978
            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
979
            if (check_copied) {
980
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
981
                    >> s->cluster_bits);
982
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
983
                    fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
984
                        " refcount=%d\n", l2_offset, refcount);
985
                    errors++;
986
                }
987
            }
988

    
989
            /* Mark L2 table as used */
990
            l2_offset &= ~QCOW_OFLAG_COPIED;
991
            errors += inc_refcounts(bs, refcount_table,
992
                          refcount_table_size,
993
                          l2_offset,
994
                          s->cluster_size);
995

    
996
            /* L2 tables are cluster aligned */
997
            if (l2_offset & (s->cluster_size - 1)) {
998
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
999
                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1000
                errors++;
1001
            }
1002

    
1003
            /* Process and check L2 entries */
1004
            ret = check_refcounts_l2(bs, refcount_table, refcount_table_size,
1005
                l2_offset, check_copied);
1006
            if (ret < 0) {
1007
                goto fail;
1008
            }
1009
            errors += ret;
1010
        }
1011
    }
1012
    qemu_free(l1_table);
1013
    return errors;
1014

    
1015
fail:
1016
    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1017
    qemu_free(l1_table);
1018
    return -EIO;
1019
}
1020

    
1021
/*
1022
 * Checks an image for refcount consistency.
1023
 *
1024
 * Returns 0 if no errors are found, the number of errors in case the image is
1025
 * detected as corrupted, and -errno when an internal error occured.
1026
 */
1027
int qcow2_check_refcounts(BlockDriverState *bs)
1028
{
1029
    BDRVQcowState *s = bs->opaque;
1030
    int64_t size;
1031
    int nb_clusters, refcount1, refcount2, i;
1032
    QCowSnapshot *sn;
1033
    uint16_t *refcount_table;
1034
    int ret, errors = 0;
1035

    
1036
    size = bdrv_getlength(s->hd);
1037
    nb_clusters = size_to_clusters(s, size);
1038
    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
1039

    
1040
    /* header */
1041
    errors += inc_refcounts(bs, refcount_table, nb_clusters,
1042
                  0, s->cluster_size);
1043

    
1044
    /* current L1 table */
1045
    ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
1046
                       s->l1_table_offset, s->l1_size, 1);
1047
    if (ret < 0) {
1048
        return ret;
1049
    }
1050
    errors += ret;
1051

    
1052
    /* snapshots */
1053
    for(i = 0; i < s->nb_snapshots; i++) {
1054
        sn = s->snapshots + i;
1055
        check_refcounts_l1(bs, refcount_table, nb_clusters,
1056
                           sn->l1_table_offset, sn->l1_size, 0);
1057
    }
1058
    errors += inc_refcounts(bs, refcount_table, nb_clusters,
1059
                  s->snapshots_offset, s->snapshots_size);
1060

    
1061
    /* refcount data */
1062
    errors += inc_refcounts(bs, refcount_table, nb_clusters,
1063
                  s->refcount_table_offset,
1064
                  s->refcount_table_size * sizeof(uint64_t));
1065
    for(i = 0; i < s->refcount_table_size; i++) {
1066
        int64_t offset;
1067
        offset = s->refcount_table[i];
1068

    
1069
        /* Refcount blocks are cluster aligned */
1070
        if (offset & (s->cluster_size - 1)) {
1071
            fprintf(stderr, "ERROR refcount block %d is not "
1072
                "cluster aligned; refcount table entry corrupted\n", i);
1073
            errors++;
1074
        }
1075

    
1076
        if (offset != 0) {
1077
            errors += inc_refcounts(bs, refcount_table, nb_clusters,
1078
                          offset, s->cluster_size);
1079
            if (refcount_table[offset / s->cluster_size] != 1) {
1080
                fprintf(stderr, "ERROR refcount block %d refcount=%d\n",
1081
                    i, refcount_table[offset / s->cluster_size]);
1082
            }
1083
        }
1084
    }
1085

    
1086
    /* compare ref counts */
1087
    for(i = 0; i < nb_clusters; i++) {
1088
        refcount1 = get_refcount(bs, i);
1089
        refcount2 = refcount_table[i];
1090
        if (refcount1 != refcount2) {
1091
            fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n",
1092
                   i, refcount1, refcount2);
1093
            errors++;
1094
        }
1095
    }
1096

    
1097
    qemu_free(refcount_table);
1098

    
1099
    return errors;
1100
}
1101