Statistics
| Branch: | Revision:

root / block / qcow2.c @ 72daa72e

History | View | Annotate | Download (70.6 kB)

1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "block/block_int.h"
26
#include "qemu/module.h"
27
#include <zlib.h>
28
#include "qemu/aes.h"
29
#include "block/qcow2.h"
30
#include "qemu/error-report.h"
31
#include "qapi/qmp/qerror.h"
32
#include "qapi/qmp/qbool.h"
33
#include "trace.h"
34

    
35
/*
36
  Differences with QCOW:
37

38
  - Support for multiple incremental snapshots.
39
  - Memory management by reference counts.
40
  - Clusters which have a reference count of one have the bit
41
    QCOW_OFLAG_COPIED to optimize write performance.
42
  - Size of compressed clusters is stored in sectors to reduce bit usage
43
    in the cluster offsets.
44
  - Support for storing additional data (such as the VM state) in the
45
    snapshots.
46
  - If a backing store is used, the cluster size is not constrained
47
    (could be backported to QCOW).
48
  - L2 tables always have a size of one cluster.
49
*/
50

    
51

    
52
/*
 * Fixed-size header that precedes every qcow2 header extension in the image
 * file.  Both fields are stored big-endian and are byte-swapped after being
 * read; the extension payload follows directly after this header.
 */
typedef struct {
    uint32_t magic; /* extension type (see the QCOW2_EXT_MAGIC_* values) */
    uint32_t len;   /* payload length in bytes, excluding this header */
} QEMU_PACKED QCowExtension;
56

    
57
/* Header extension type identifiers, compared against QCowExtension.magic */
#define QCOW2_EXT_MAGIC_END            0          /* terminates the list */
#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA /* backing format name */
#define QCOW2_EXT_MAGIC_FEATURE_TABLE  0x6803f857 /* feature name table */
60

    
61
static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
62
{
63
    const QCowHeader *cow_header = (const void *)buf;
64

    
65
    if (buf_size >= sizeof(QCowHeader) &&
66
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
67
        be32_to_cpu(cow_header->version) >= 2)
68
        return 100;
69
    else
70
        return 0;
71
}
72

    
73

    
74
/* 
75
 * read qcow2 extension and fill bs
76
 * start reading from start_offset
77
 * finish reading upon magic of value 0 or when end_offset reached
78
 * unknown magic is skipped (future extension this version knows nothing about)
79
 * return 0 upon success, non-0 otherwise
80
 */
81
static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
82
                                 uint64_t end_offset, void **p_feature_table,
83
                                 Error **errp)
84
{
85
    BDRVQcowState *s = bs->opaque;
86
    QCowExtension ext;
87
    uint64_t offset;
88
    int ret;
89

    
90
#ifdef DEBUG_EXT
91
    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
92
#endif
93
    offset = start_offset;
94
    while (offset < end_offset) {
95

    
96
#ifdef DEBUG_EXT
97
        /* Sanity check */
98
        if (offset > s->cluster_size)
99
            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
100

    
101
        printf("attempting to read extended header in offset %lu\n", offset);
102
#endif
103

    
104
        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
105
        if (ret < 0) {
106
            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
107
                             "pread fail from offset %" PRIu64, offset);
108
            return 1;
109
        }
110
        be32_to_cpus(&ext.magic);
111
        be32_to_cpus(&ext.len);
112
        offset += sizeof(ext);
113
#ifdef DEBUG_EXT
114
        printf("ext.magic = 0x%x\n", ext.magic);
115
#endif
116
        if (ext.len > end_offset - offset) {
117
            error_setg(errp, "Header extension too large");
118
            return -EINVAL;
119
        }
120

    
121
        switch (ext.magic) {
122
        case QCOW2_EXT_MAGIC_END:
123
            return 0;
124

    
125
        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
126
            if (ext.len >= sizeof(bs->backing_format)) {
127
                error_setg(errp, "ERROR: ext_backing_format: len=%u too large"
128
                           " (>=%zu)", ext.len, sizeof(bs->backing_format));
129
                return 2;
130
            }
131
            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
132
            if (ret < 0) {
133
                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
134
                                 "Could not read format name");
135
                return 3;
136
            }
137
            bs->backing_format[ext.len] = '\0';
138
#ifdef DEBUG_EXT
139
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
140
#endif
141
            break;
142

    
143
        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
144
            if (p_feature_table != NULL) {
145
                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
146
                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
147
                if (ret < 0) {
148
                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
149
                                     "Could not read table");
150
                    return ret;
151
                }
152

    
153
                *p_feature_table = feature_table;
154
            }
155
            break;
156

    
157
        default:
158
            /* unknown magic - save it in case we need to rewrite the header */
159
            {
160
                Qcow2UnknownHeaderExtension *uext;
161

    
162
                uext = g_malloc0(sizeof(*uext)  + ext.len);
163
                uext->magic = ext.magic;
164
                uext->len = ext.len;
165
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
166

    
167
                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
168
                if (ret < 0) {
169
                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
170
                                     "Could not read data");
171
                    return ret;
172
                }
173
            }
174
            break;
175
        }
176

    
177
        offset += ((ext.len + 7) & ~7);
178
    }
179

    
180
    return 0;
181
}
182

    
183
/*
 * Unlink and free every entry of s->unknown_header_ext, i.e. the unknown
 * header extensions that were saved while reading the image header.
 */
static void cleanup_unknown_header_ext(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    Qcow2UnknownHeaderExtension *entry;
    Qcow2UnknownHeaderExtension *following;

    /* The _SAFE variant is needed because entries are freed while iterating */
    QLIST_FOREACH_SAFE(entry, &s->unknown_header_ext, next, following) {
        QLIST_REMOVE(entry, next);
        g_free(entry);
    }
}
193

    
194
/*
 * Set errp to a QERR_UNKNOWN_BLOCK_FORMAT_FEATURE error for the "qcow2"
 * format on device bs->device_name.  The feature description is built
 * printf-style from fmt and is truncated to fit a 64 byte buffer.
 */
static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
    Error **errp, const char *fmt, ...)
{
    char feature[64];
    va_list args;

    va_start(args, fmt);
    vsnprintf(feature, sizeof(feature), fmt, args);
    va_end(args);

    error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2",
              feature);
}
207

    
208
static void report_unsupported_feature(BlockDriverState *bs,
209
    Error **errp, Qcow2Feature *table, uint64_t mask)
210
{
211
    while (table && table->name[0] != '\0') {
212
        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
213
            if (mask & (1 << table->bit)) {
214
                report_unsupported(bs, errp, "%.46s", table->name);
215
                mask &= ~(1 << table->bit);
216
            }
217
        }
218
        table++;
219
    }
220

    
221
    if (mask) {
222
        report_unsupported(bs, errp, "Unknown incompatible feature: %" PRIx64,
223
                           mask);
224
    }
225
}
226

    
227
/*
228
 * Sets the dirty bit and flushes afterwards if necessary.
229
 *
230
 * The incompatible_features bit is only set if the image file header was
231
 * updated successfully.  Therefore it is not required to check the return
232
 * value of this function.
233
 */
234
int qcow2_mark_dirty(BlockDriverState *bs)
235
{
236
    BDRVQcowState *s = bs->opaque;
237
    uint64_t val;
238
    int ret;
239

    
240
    assert(s->qcow_version >= 3);
241

    
242
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
243
        return 0; /* already dirty */
244
    }
245

    
246
    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
247
    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
248
                      &val, sizeof(val));
249
    if (ret < 0) {
250
        return ret;
251
    }
252
    ret = bdrv_flush(bs->file);
253
    if (ret < 0) {
254
        return ret;
255
    }
256

    
257
    /* Only treat image as dirty if the header was updated successfully */
258
    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
259
    return 0;
260
}
261

    
262
/*
263
 * Clears the dirty bit and flushes before if necessary.  Only call this
264
 * function when there are no pending requests, it does not guard against
265
 * concurrent requests dirtying the image.
266
 */
267
static int qcow2_mark_clean(BlockDriverState *bs)
268
{
269
    BDRVQcowState *s = bs->opaque;
270

    
271
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
272
        int ret = bdrv_flush(bs);
273
        if (ret < 0) {
274
            return ret;
275
        }
276

    
277
        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
278
        return qcow2_update_header(bs);
279
    }
280
    return 0;
281
}
282

    
283
/*
284
 * Marks the image as corrupt.
285
 */
286
int qcow2_mark_corrupt(BlockDriverState *bs)
287
{
288
    BDRVQcowState *s = bs->opaque;
289

    
290
    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
291
    return qcow2_update_header(bs);
292
}
293

    
294
/*
295
 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
296
 * before if necessary.
297
 */
298
int qcow2_mark_consistent(BlockDriverState *bs)
299
{
300
    BDRVQcowState *s = bs->opaque;
301

    
302
    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
303
        int ret = bdrv_flush(bs);
304
        if (ret < 0) {
305
            return ret;
306
        }
307

    
308
        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
309
        return qcow2_update_header(bs);
310
    }
311
    return 0;
312
}
313

    
314
static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
315
                       BdrvCheckMode fix)
316
{
317
    int ret = qcow2_check_refcounts(bs, result, fix);
318
    if (ret < 0) {
319
        return ret;
320
    }
321

    
322
    if (fix && result->check_errors == 0 && result->corruptions == 0) {
323
        ret = qcow2_mark_clean(bs);
324
        if (ret < 0) {
325
            return ret;
326
        }
327
        return qcow2_mark_consistent(bs);
328
    }
329
    return ret;
330
}
331

    
332
static QemuOptsList qcow2_runtime_opts = {
333
    .name = "qcow2",
334
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
335
    .desc = {
336
        {
337
            .name = QCOW2_OPT_LAZY_REFCOUNTS,
338
            .type = QEMU_OPT_BOOL,
339
            .help = "Postpone refcount updates",
340
        },
341
        {
342
            .name = QCOW2_OPT_DISCARD_REQUEST,
343
            .type = QEMU_OPT_BOOL,
344
            .help = "Pass guest discard requests to the layer below",
345
        },
346
        {
347
            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
348
            .type = QEMU_OPT_BOOL,
349
            .help = "Generate discard requests when snapshot related space "
350
                    "is freed",
351
        },
352
        {
353
            .name = QCOW2_OPT_DISCARD_OTHER,
354
            .type = QEMU_OPT_BOOL,
355
            .help = "Generate discard requests when other clusters are freed",
356
        },
357
        {
358
            .name = QCOW2_OPT_OVERLAP,
359
            .type = QEMU_OPT_STRING,
360
            .help = "Selects which overlap checks to perform from a range of "
361
                    "templates (none, constant, cached, all)",
362
        },
363
        {
364
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
365
            .type = QEMU_OPT_BOOL,
366
            .help = "Check for unintended writes into the main qcow2 header",
367
        },
368
        {
369
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
370
            .type = QEMU_OPT_BOOL,
371
            .help = "Check for unintended writes into the active L1 table",
372
        },
373
        {
374
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
375
            .type = QEMU_OPT_BOOL,
376
            .help = "Check for unintended writes into an active L2 table",
377
        },
378
        {
379
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
380
            .type = QEMU_OPT_BOOL,
381
            .help = "Check for unintended writes into the refcount table",
382
        },
383
        {
384
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
385
            .type = QEMU_OPT_BOOL,
386
            .help = "Check for unintended writes into a refcount block",
387
        },
388
        {
389
            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
390
            .type = QEMU_OPT_BOOL,
391
            .help = "Check for unintended writes into the snapshot table",
392
        },
393
        {
394
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
395
            .type = QEMU_OPT_BOOL,
396
            .help = "Check for unintended writes into an inactive L1 table",
397
        },
398
        {
399
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
400
            .type = QEMU_OPT_BOOL,
401
            .help = "Check for unintended writes into an inactive L2 table",
402
        },
403
        { /* end of list */ }
404
    },
405
};
406

    
407
/*
 * Maps each overlap check bit number to the name of the boolean runtime
 * option that switches this particular check on or off.
 */
static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    [QCOW2_OL_ACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2,
};
417

    
418
static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
419
                      Error **errp)
420
{
421
    BDRVQcowState *s = bs->opaque;
422
    int len, i, ret = 0;
423
    QCowHeader header;
424
    QemuOpts *opts;
425
    Error *local_err = NULL;
426
    uint64_t ext_end;
427
    uint64_t l1_vm_state_index;
428
    const char *opt_overlap_check;
429
    int overlap_check_template = 0;
430

    
431
    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
432
    if (ret < 0) {
433
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
434
        goto fail;
435
    }
436
    be32_to_cpus(&header.magic);
437
    be32_to_cpus(&header.version);
438
    be64_to_cpus(&header.backing_file_offset);
439
    be32_to_cpus(&header.backing_file_size);
440
    be64_to_cpus(&header.size);
441
    be32_to_cpus(&header.cluster_bits);
442
    be32_to_cpus(&header.crypt_method);
443
    be64_to_cpus(&header.l1_table_offset);
444
    be32_to_cpus(&header.l1_size);
445
    be64_to_cpus(&header.refcount_table_offset);
446
    be32_to_cpus(&header.refcount_table_clusters);
447
    be64_to_cpus(&header.snapshots_offset);
448
    be32_to_cpus(&header.nb_snapshots);
449

    
450
    if (header.magic != QCOW_MAGIC) {
451
        error_setg(errp, "Image is not in qcow2 format");
452
        ret = -EMEDIUMTYPE;
453
        goto fail;
454
    }
455
    if (header.version < 2 || header.version > 3) {
456
        report_unsupported(bs, errp, "QCOW version %d", header.version);
457
        ret = -ENOTSUP;
458
        goto fail;
459
    }
460

    
461
    s->qcow_version = header.version;
462

    
463
    /* Initialise version 3 header fields */
464
    if (header.version == 2) {
465
        header.incompatible_features    = 0;
466
        header.compatible_features      = 0;
467
        header.autoclear_features       = 0;
468
        header.refcount_order           = 4;
469
        header.header_length            = 72;
470
    } else {
471
        be64_to_cpus(&header.incompatible_features);
472
        be64_to_cpus(&header.compatible_features);
473
        be64_to_cpus(&header.autoclear_features);
474
        be32_to_cpus(&header.refcount_order);
475
        be32_to_cpus(&header.header_length);
476
    }
477

    
478
    if (header.header_length > sizeof(header)) {
479
        s->unknown_header_fields_size = header.header_length - sizeof(header);
480
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
481
        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
482
                         s->unknown_header_fields_size);
483
        if (ret < 0) {
484
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
485
                             "fields");
486
            goto fail;
487
        }
488
    }
489

    
490
    if (header.backing_file_offset) {
491
        ext_end = header.backing_file_offset;
492
    } else {
493
        ext_end = 1 << header.cluster_bits;
494
    }
495

    
496
    /* Handle feature bits */
497
    s->incompatible_features    = header.incompatible_features;
498
    s->compatible_features      = header.compatible_features;
499
    s->autoclear_features       = header.autoclear_features;
500

    
501
    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
502
        void *feature_table = NULL;
503
        qcow2_read_extensions(bs, header.header_length, ext_end,
504
                              &feature_table, NULL);
505
        report_unsupported_feature(bs, errp, feature_table,
506
                                   s->incompatible_features &
507
                                   ~QCOW2_INCOMPAT_MASK);
508
        ret = -ENOTSUP;
509
        goto fail;
510
    }
511

    
512
    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
513
        /* Corrupt images may not be written to unless they are being repaired
514
         */
515
        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
516
            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
517
                       "read/write");
518
            ret = -EACCES;
519
            goto fail;
520
        }
521
    }
522

    
523
    /* Check support for various header values */
524
    if (header.refcount_order != 4) {
525
        report_unsupported(bs, errp, "%d bit reference counts",
526
                           1 << header.refcount_order);
527
        ret = -ENOTSUP;
528
        goto fail;
529
    }
530
    s->refcount_order = header.refcount_order;
531

    
532
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
533
        header.cluster_bits > MAX_CLUSTER_BITS) {
534
        error_setg(errp, "Unsupported cluster size: 2^%i", header.cluster_bits);
535
        ret = -EINVAL;
536
        goto fail;
537
    }
538
    if (header.crypt_method > QCOW_CRYPT_AES) {
539
        error_setg(errp, "Unsupported encryption method: %i",
540
                   header.crypt_method);
541
        ret = -EINVAL;
542
        goto fail;
543
    }
544
    s->crypt_method_header = header.crypt_method;
545
    if (s->crypt_method_header) {
546
        bs->encrypted = 1;
547
    }
548
    s->cluster_bits = header.cluster_bits;
549
    s->cluster_size = 1 << s->cluster_bits;
550
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
551
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
552
    s->l2_size = 1 << s->l2_bits;
553
    bs->total_sectors = header.size / 512;
554
    s->csize_shift = (62 - (s->cluster_bits - 8));
555
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
556
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
557
    s->refcount_table_offset = header.refcount_table_offset;
558
    s->refcount_table_size =
559
        header.refcount_table_clusters << (s->cluster_bits - 3);
560

    
561
    s->snapshots_offset = header.snapshots_offset;
562
    s->nb_snapshots = header.nb_snapshots;
563

    
564
    /* read the level 1 table */
565
    s->l1_size = header.l1_size;
566

    
567
    l1_vm_state_index = size_to_l1(s, header.size);
568
    if (l1_vm_state_index > INT_MAX) {
569
        error_setg(errp, "Image is too big");
570
        ret = -EFBIG;
571
        goto fail;
572
    }
573
    s->l1_vm_state_index = l1_vm_state_index;
574

    
575
    /* the L1 table must contain at least enough entries to put
576
       header.size bytes */
577
    if (s->l1_size < s->l1_vm_state_index) {
578
        error_setg(errp, "L1 table is too small");
579
        ret = -EINVAL;
580
        goto fail;
581
    }
582
    s->l1_table_offset = header.l1_table_offset;
583
    if (s->l1_size > 0) {
584
        s->l1_table = g_malloc0(
585
            align_offset(s->l1_size * sizeof(uint64_t), 512));
586
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
587
                         s->l1_size * sizeof(uint64_t));
588
        if (ret < 0) {
589
            error_setg_errno(errp, -ret, "Could not read L1 table");
590
            goto fail;
591
        }
592
        for(i = 0;i < s->l1_size; i++) {
593
            be64_to_cpus(&s->l1_table[i]);
594
        }
595
    }
596

    
597
    /* alloc L2 table/refcount block cache */
598
    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
599
    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
600

    
601
    s->cluster_cache = g_malloc(s->cluster_size);
602
    /* one more sector for decompressed data alignment */
603
    s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
604
                                  + 512);
605
    s->cluster_cache_offset = -1;
606
    s->flags = flags;
607

    
608
    ret = qcow2_refcount_init(bs);
609
    if (ret != 0) {
610
        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
611
        goto fail;
612
    }
613

    
614
    QLIST_INIT(&s->cluster_allocs);
615
    QTAILQ_INIT(&s->discards);
616

    
617
    /* read qcow2 extensions */
618
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
619
        &local_err)) {
620
        error_propagate(errp, local_err);
621
        ret = -EINVAL;
622
        goto fail;
623
    }
624

    
625
    /* read the backing file name */
626
    if (header.backing_file_offset != 0) {
627
        len = header.backing_file_size;
628
        if (len > 1023) {
629
            len = 1023;
630
        }
631
        ret = bdrv_pread(bs->file, header.backing_file_offset,
632
                         bs->backing_file, len);
633
        if (ret < 0) {
634
            error_setg_errno(errp, -ret, "Could not read backing file name");
635
            goto fail;
636
        }
637
        bs->backing_file[len] = '\0';
638
    }
639

    
640
    ret = qcow2_read_snapshots(bs);
641
    if (ret < 0) {
642
        error_setg_errno(errp, -ret, "Could not read snapshots");
643
        goto fail;
644
    }
645

    
646
    /* Clear unknown autoclear feature bits */
647
    if (!bs->read_only && s->autoclear_features != 0) {
648
        s->autoclear_features = 0;
649
        ret = qcow2_update_header(bs);
650
        if (ret < 0) {
651
            error_setg_errno(errp, -ret, "Could not update qcow2 header");
652
            goto fail;
653
        }
654
    }
655

    
656
    /* Initialise locks */
657
    qemu_co_mutex_init(&s->lock);
658

    
659
    /* Repair image if dirty */
660
    if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
661
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
662
        BdrvCheckResult result = {0};
663

    
664
        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
665
        if (ret < 0) {
666
            error_setg_errno(errp, -ret, "Could not repair dirty image");
667
            goto fail;
668
        }
669
    }
670

    
671
    /* Enable lazy_refcounts according to image and command line options */
672
    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
673
    qemu_opts_absorb_qdict(opts, options, &local_err);
674
    if (error_is_set(&local_err)) {
675
        error_propagate(errp, local_err);
676
        ret = -EINVAL;
677
        goto fail;
678
    }
679

    
680
    s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
681
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
682

    
683
    s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
684
    s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
685
    s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
686
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
687
                          flags & BDRV_O_UNMAP);
688
    s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
689
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
690
    s->discard_passthrough[QCOW2_DISCARD_OTHER] =
691
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
692

    
693
    opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached";
694
    if (!strcmp(opt_overlap_check, "none")) {
695
        overlap_check_template = 0;
696
    } else if (!strcmp(opt_overlap_check, "constant")) {
697
        overlap_check_template = QCOW2_OL_CONSTANT;
698
    } else if (!strcmp(opt_overlap_check, "cached")) {
699
        overlap_check_template = QCOW2_OL_CACHED;
700
    } else if (!strcmp(opt_overlap_check, "all")) {
701
        overlap_check_template = QCOW2_OL_ALL;
702
    } else {
703
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
704
                   "'overlap-check'. Allowed are either of the following: "
705
                   "none, constant, cached, all", opt_overlap_check);
706
        qemu_opts_del(opts);
707
        ret = -EINVAL;
708
        goto fail;
709
    }
710

    
711
    s->overlap_check = 0;
712
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
713
        /* overlap-check defines a template bitmask, but every flag may be
714
         * overwritten through the associated boolean option */
715
        s->overlap_check |=
716
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
717
                              overlap_check_template & (1 << i)) << i;
718
    }
719

    
720
    qemu_opts_del(opts);
721
    bs->bl.write_zeroes_alignment = s->cluster_sectors;
722

    
723
    if (s->use_lazy_refcounts && s->qcow_version < 3) {
724
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
725
                   "qemu 1.1 compatibility level");
726
        ret = -EINVAL;
727
        goto fail;
728
    }
729

    
730
#ifdef DEBUG_ALLOC
731
    {
732
        BdrvCheckResult result = {0};
733
        qcow2_check_refcounts(bs, &result, 0);
734
    }
735
#endif
736
    return ret;
737

    
738
 fail:
739
    g_free(s->unknown_header_fields);
740
    cleanup_unknown_header_ext(bs);
741
    qcow2_free_snapshots(bs);
742
    qcow2_refcount_close(bs);
743
    g_free(s->l1_table);
744
    /* else pre-write overlap checks in cache_destroy may crash */
745
    s->l1_table = NULL;
746
    if (s->l2_table_cache) {
747
        qcow2_cache_destroy(bs, s->l2_table_cache);
748
    }
749
    g_free(s->cluster_cache);
750
    qemu_vfree(s->cluster_data);
751
    return ret;
752
}
753

    
754
/*
 * Set the AES key used for an encrypted image.
 *
 * At most the first 16 characters of 'key' are used; a shorter key is padded
 * with zero bytes to 128 bits.  The encryption method announced in the image
 * header becomes the active one, and this happens before the AES key
 * schedules are set up.
 *
 * Returns 0 on success, -1 if either AES key schedule could not be set up.
 */
static int qcow2_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16] = {0};
    int len, i;

    len = strlen(key);
    if (len > 16) {
        len = 16;
    }
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for (i = 0; i < len; i++) {
        keybuf[i] = key[i];
    }
    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0 ||
        AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) {
        return -1;
    }
    return 0;
}
795

    
796
/* We have nothing to do for QCOW2 reopen, stubs just return
797
 * success */
798
static int qcow2_reopen_prepare(BDRVReopenState *state,
799
                                BlockReopenQueue *queue, Error **errp)
800
{
801
    return 0;
802
}
803

    
804
/*
 * Report the allocation status of the sectors starting at sector_num.
 *
 * *pnum starts out as nb_sectors and is passed to
 * qcow2_get_cluster_offset(), which is called with s->lock held.
 *
 * Returns a negative value if the cluster lookup fails.  Otherwise returns a
 * combination of BDRV_BLOCK_* flags:
 *  - BDRV_BLOCK_OFFSET_VALID together with the host offset, when the lookup
 *    yields a non-zero cluster offset and the cluster is neither compressed
 *    nor is encryption active;
 *  - BDRV_BLOCK_ZERO for zero clusters;
 *  - BDRV_BLOCK_DATA for every other cluster type except unallocated ones.
 */
static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t cluster_offset;
    int index_in_cluster, ret;
    int64_t status = 0;

    *pnum = nb_sectors;

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }

    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
        !s->crypt_method) {
        /* Add the position of the sector inside its cluster */
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
        status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
    }

    switch (ret) {
    case QCOW2_CLUSTER_ZERO:
        status |= BDRV_BLOCK_ZERO;
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        status |= BDRV_BLOCK_DATA;
        break;
    }

    return status;
}
833

    
834
/* handle reading after the end of the backing file */
835
int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
836
                  int64_t sector_num, int nb_sectors)
837
{
838
    int n1;
839
    if ((sector_num + nb_sectors) <= bs->total_sectors)
840
        return nb_sectors;
841
    if (sector_num >= bs->total_sectors)
842
        n1 = 0;
843
    else
844
        n1 = bs->total_sectors - sector_num;
845

    
846
    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
847

    
848
    return n1;
849
}
850

    
851
/*
 * Read @remaining_sectors sectors starting at guest sector @sector_num into
 * @qiov.
 *
 * The request is served piecewise: each iteration looks up the cluster
 * mapping for the current position and handles as many sectors as that
 * lookup reports in cur_nr_sectors.  s->lock is held throughout, except
 * while the actual read from the backing file or the image file is issued.
 *
 * Returns 0 on success, or the negative value of the first step that failed.
 */
static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                          int remaining_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    QEMUIOVector hd_qiov;
    uint8_t *crypt_buf = NULL;      /* bounce buffer, encrypted images only */
    uint64_t cluster_offset = 0;
    uint64_t bytes_done = 0;
    int ret;

    qemu_iovec_init(&hd_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    while (remaining_sectors != 0) {
        int index_in_cluster;
        int cur_nr_sectors = remaining_sectors; /* sectors in this iteration */

        /* prepare next request; encrypted reads are bounded by the buffer */
        if (s->crypt_method) {
            cur_nr_sectors = MIN(cur_nr_sectors,
                                 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
        }

        ret = qcow2_get_cluster_offset(bs, sector_num << 9,
                                       &cur_nr_sectors, &cluster_offset);
        if (ret < 0) {
            goto fail;
        }

        index_in_cluster = sector_num & (s->cluster_sectors - 1);

        /* hd_qiov covers exactly the slice of qiov handled this round */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_nr_sectors * 512);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (!bs->backing_hd) {
                /* Note: in this case, no need to wait */
                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
            } else {
                /* read from the base image */
                int backing_sectors = qcow2_backing_read1(bs->backing_hd,
                                                          &hd_qiov, sector_num,
                                                          cur_nr_sectors);
                if (backing_sectors > 0) {
                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                    qemu_co_mutex_unlock(&s->lock);
                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
                                        backing_sectors, &hd_qiov);
                    qemu_co_mutex_lock(&s->lock);
                    if (ret < 0) {
                        goto fail;
                    }
                }
            }
            break;

        case QCOW2_CLUSTER_ZERO:
            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            /* add AIO support for compressed blocks ? */
            ret = qcow2_decompress_cluster(bs, cluster_offset);
            if (ret < 0) {
                goto fail;
            }

            /* copy the requested sectors out of s->cluster_cache */
            qemu_iovec_from_buf(&hd_qiov, 0,
                                s->cluster_cache + index_in_cluster * 512,
                                512 * cur_nr_sectors);
            break;

        case QCOW2_CLUSTER_NORMAL:
            /* a host offset that is not sector aligned is treated as an error */
            if ((cluster_offset & 511) != 0) {
                ret = -EIO;
                goto fail;
            }

            if (s->crypt_method) {
                /*
                 * For encrypted images, read everything into a temporary
                 * contiguous buffer on which the AES functions can work.
                 */
                if (!crypt_buf) {
                    crypt_buf = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
                                                    s->cluster_size);
                }

                assert(cur_nr_sectors <=
                       QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
                qemu_iovec_reset(&hd_qiov);
                qemu_iovec_add(&hd_qiov, crypt_buf, 512 * cur_nr_sectors);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_readv(bs->file,
                                (cluster_offset >> 9) + index_in_cluster,
                                cur_nr_sectors, &hd_qiov);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }

            if (s->crypt_method) {
                /* decrypt in place, then copy to the caller's vector */
                qcow2_encrypt_sectors(s, sector_num, crypt_buf, crypt_buf,
                                      cur_nr_sectors, 0, &s->aes_decrypt_key);
                qemu_iovec_from_buf(qiov, bytes_done,
                                    crypt_buf, 512 * cur_nr_sectors);
            }
            break;

        default:
            g_assert_not_reached();
            ret = -EIO;
            goto fail;
        }

        remaining_sectors -= cur_nr_sectors;
        sector_num += cur_nr_sectors;
        bytes_done += cur_nr_sectors * 512;
    }
    ret = 0;

fail:
    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(crypt_buf);

    return ret;
}
987

    
988
static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
989
                           int64_t sector_num,
990
                           int remaining_sectors,
991
                           QEMUIOVector *qiov)
992
{
993
    BDRVQcowState *s = bs->opaque;
994
    int index_in_cluster;
995
    int n_end;
996
    int ret;
997
    int cur_nr_sectors; /* number of sectors in current iteration */
998
    uint64_t cluster_offset;
999
    QEMUIOVector hd_qiov;
1000
    uint64_t bytes_done = 0;
1001
    uint8_t *cluster_data = NULL;
1002
    QCowL2Meta *l2meta = NULL;
1003

    
1004
    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
1005
                                 remaining_sectors);
1006

    
1007
    qemu_iovec_init(&hd_qiov, qiov->niov);
1008

    
1009
    s->cluster_cache_offset = -1; /* disable compressed cache */
1010

    
1011
    qemu_co_mutex_lock(&s->lock);
1012

    
1013
    while (remaining_sectors != 0) {
1014

    
1015
        l2meta = NULL;
1016

    
1017
        trace_qcow2_writev_start_part(qemu_coroutine_self());
1018
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1019
        n_end = index_in_cluster + remaining_sectors;
1020
        if (s->crypt_method &&
1021
            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
1022
            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1023
        }
1024

    
1025
        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
1026
            index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
1027
        if (ret < 0) {
1028
            goto fail;
1029
        }
1030

    
1031
        assert((cluster_offset & 511) == 0);
1032

    
1033
        qemu_iovec_reset(&hd_qiov);
1034
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
1035
            cur_nr_sectors * 512);
1036

    
1037
        if (s->crypt_method) {
1038
            if (!cluster_data) {
1039
                cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
1040
                                                 s->cluster_size);
1041
            }
1042

    
1043
            assert(hd_qiov.size <=
1044
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
1045
            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
1046

    
1047
            qcow2_encrypt_sectors(s, sector_num, cluster_data,
1048
                cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
1049

    
1050
            qemu_iovec_reset(&hd_qiov);
1051
            qemu_iovec_add(&hd_qiov, cluster_data,
1052
                cur_nr_sectors * 512);
1053
        }
1054

    
1055
        ret = qcow2_pre_write_overlap_check(bs, 0,
1056
                cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE,
1057
                cur_nr_sectors * BDRV_SECTOR_SIZE);
1058
        if (ret < 0) {
1059
            goto fail;
1060
        }
1061

    
1062
        qemu_co_mutex_unlock(&s->lock);
1063
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
1064
        trace_qcow2_writev_data(qemu_coroutine_self(),
1065
                                (cluster_offset >> 9) + index_in_cluster);
1066
        ret = bdrv_co_writev(bs->file,
1067
                             (cluster_offset >> 9) + index_in_cluster,
1068
                             cur_nr_sectors, &hd_qiov);
1069
        qemu_co_mutex_lock(&s->lock);
1070
        if (ret < 0) {
1071
            goto fail;
1072
        }
1073

    
1074
        while (l2meta != NULL) {
1075
            QCowL2Meta *next;
1076

    
1077
            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
1078
            if (ret < 0) {
1079
                goto fail;
1080
            }
1081

    
1082
            /* Take the request off the list of running requests */
1083
            if (l2meta->nb_clusters != 0) {
1084
                QLIST_REMOVE(l2meta, next_in_flight);
1085
            }
1086

    
1087
            qemu_co_queue_restart_all(&l2meta->dependent_requests);
1088

    
1089
            next = l2meta->next;
1090
            g_free(l2meta);
1091
            l2meta = next;
1092
        }
1093

    
1094
        remaining_sectors -= cur_nr_sectors;
1095
        sector_num += cur_nr_sectors;
1096
        bytes_done += cur_nr_sectors * 512;
1097
        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
1098
    }
1099
    ret = 0;
1100

    
1101
fail:
1102
    qemu_co_mutex_unlock(&s->lock);
1103

    
1104
    while (l2meta != NULL) {
1105
        QCowL2Meta *next;
1106

    
1107
        if (l2meta->nb_clusters != 0) {
1108
            QLIST_REMOVE(l2meta, next_in_flight);
1109
        }
1110
        qemu_co_queue_restart_all(&l2meta->dependent_requests);
1111

    
1112
        next = l2meta->next;
1113
        g_free(l2meta);
1114
        l2meta = next;
1115
    }
1116

    
1117
    qemu_iovec_destroy(&hd_qiov);
1118
    qemu_vfree(cluster_data);
1119
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
1120

    
1121
    return ret;
1122
}
1123

    
1124
/*
 * Tear down the qcow2 driver state of @bs.
 *
 * The L1 table is released first, both metadata caches are flushed, the
 * image is marked clean, and then the caches and the remaining per-image
 * allocations are freed.  Return values of the flush calls are not checked.
 */
static void qcow2_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    g_free(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    /* Flush cached metadata before marking the image clean */
    qcow2_cache_flush(bs, s->l2_table_cache);
    qcow2_cache_flush(bs, s->refcount_block_cache);

    qcow2_mark_clean(bs);

    qcow2_cache_destroy(bs, s->l2_table_cache);
    qcow2_cache_destroy(bs, s->refcount_block_cache);

    /* Preserved unknown header fields and extensions */
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    /* Cluster buffers, refcount state and snapshot list */
    g_free(s->cluster_cache);
    qemu_vfree(s->cluster_data);
    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}
1147

    
1148
/*
 * Throw away all in-memory qcow2 state of @bs and rebuild it by closing and
 * reopening the image with the same flags.
 *
 * The encryption method and AES keys are saved across the reopen and put
 * back afterwards, and the current lazy-refcounts setting is passed to
 * qcow2_open() as an option.  The result of qcow2_open() is not checked.
 */
static void qcow2_invalidate_cache(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int flags = s->flags;
    AES_KEY saved_encrypt_key;
    AES_KEY saved_decrypt_key;
    uint32_t saved_crypt_method = 0;
    QDict *options;

    /*
     * Backing files are read-only which makes all of their metadata immutable,
     * that means we don't have to worry about reopening them here.
     */

    /* Remember the key material; the state is wiped below */
    if (s->crypt_method) {
        saved_crypt_method = s->crypt_method;
        memcpy(&saved_encrypt_key, &s->aes_encrypt_key,
               sizeof(saved_encrypt_key));
        memcpy(&saved_decrypt_key, &s->aes_decrypt_key,
               sizeof(saved_decrypt_key));
    }

    qcow2_close(bs);

    /* Carry the lazy-refcounts setting over into the reopened state */
    options = qdict_new();
    qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
              qbool_from_int(s->use_lazy_refcounts));

    memset(s, 0, sizeof(BDRVQcowState));
    qcow2_open(bs, options, flags, NULL);

    QDECREF(options);

    if (saved_crypt_method) {
        s->crypt_method = saved_crypt_method;
        memcpy(&s->aes_encrypt_key, &saved_encrypt_key,
               sizeof(saved_encrypt_key));
        memcpy(&s->aes_decrypt_key, &saved_decrypt_key,
               sizeof(saved_decrypt_key));
    }
}
1185

    
1186
static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
1187
    size_t len, size_t buflen)
1188
{
1189
    QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
1190
    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
1191

    
1192
    if (buflen < ext_len) {
1193
        return -ENOSPC;
1194
    }
1195

    
1196
    *ext_backing_fmt = (QCowExtension) {
1197
        .magic  = cpu_to_be32(magic),
1198
        .len    = cpu_to_be32(len),
1199
    };
1200
    memcpy(buf + sizeof(QCowExtension), s, len);
1201

    
1202
    return ext_len;
1203
}
1204

    
1205
/*
1206
 * Updates the qcow2 header, including the variable length parts of it, i.e.
1207
 * the backing file name and all extensions. qcow2 was not designed to allow
1208
 * such changes, so if we run out of space (we can only use the first cluster)
1209
 * this function may fail.
1210
 *
1211
 * Returns 0 on success, -errno in error cases.
1212
 */
1213
int qcow2_update_header(BlockDriverState *bs)
1214
{
1215
    BDRVQcowState *s = bs->opaque;
1216
    QCowHeader *header;
1217
    char *buf;
1218
    size_t buflen = s->cluster_size;
1219
    int ret;
1220
    uint64_t total_size;
1221
    uint32_t refcount_table_clusters;
1222
    size_t header_length;
1223
    Qcow2UnknownHeaderExtension *uext;
1224

    
1225
    buf = qemu_blockalign(bs, buflen);
1226

    
1227
    /* Header structure */
1228
    header = (QCowHeader*) buf;
1229

    
1230
    if (buflen < sizeof(*header)) {
1231
        ret = -ENOSPC;
1232
        goto fail;
1233
    }
1234

    
1235
    header_length = sizeof(*header) + s->unknown_header_fields_size;
1236
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
1237
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
1238

    
1239
    *header = (QCowHeader) {
1240
        /* Version 2 fields */
1241
        .magic                  = cpu_to_be32(QCOW_MAGIC),
1242
        .version                = cpu_to_be32(s->qcow_version),
1243
        .backing_file_offset    = 0,
1244
        .backing_file_size      = 0,
1245
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
1246
        .size                   = cpu_to_be64(total_size),
1247
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
1248
        .l1_size                = cpu_to_be32(s->l1_size),
1249
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
1250
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
1251
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
1252
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
1253
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
1254

    
1255
        /* Version 3 fields */
1256
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
1257
        .compatible_features    = cpu_to_be64(s->compatible_features),
1258
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
1259
        .refcount_order         = cpu_to_be32(s->refcount_order),
1260
        .header_length          = cpu_to_be32(header_length),
1261
    };
1262

    
1263
    /* For older versions, write a shorter header */
1264
    switch (s->qcow_version) {
1265
    case 2:
1266
        ret = offsetof(QCowHeader, incompatible_features);
1267
        break;
1268
    case 3:
1269
        ret = sizeof(*header);
1270
        break;
1271
    default:
1272
        ret = -EINVAL;
1273
        goto fail;
1274
    }
1275

    
1276
    buf += ret;
1277
    buflen -= ret;
1278
    memset(buf, 0, buflen);
1279

    
1280
    /* Preserve any unknown field in the header */
1281
    if (s->unknown_header_fields_size) {
1282
        if (buflen < s->unknown_header_fields_size) {
1283
            ret = -ENOSPC;
1284
            goto fail;
1285
        }
1286

    
1287
        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
1288
        buf += s->unknown_header_fields_size;
1289
        buflen -= s->unknown_header_fields_size;
1290
    }
1291

    
1292
    /* Backing file format header extension */
1293
    if (*bs->backing_format) {
1294
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
1295
                             bs->backing_format, strlen(bs->backing_format),
1296
                             buflen);
1297
        if (ret < 0) {
1298
            goto fail;
1299
        }
1300

    
1301
        buf += ret;
1302
        buflen -= ret;
1303
    }
1304

    
1305
    /* Feature table */
1306
    Qcow2Feature features[] = {
1307
        {
1308
            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1309
            .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
1310
            .name = "dirty bit",
1311
        },
1312
        {
1313
            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1314
            .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
1315
            .name = "corrupt bit",
1316
        },
1317
        {
1318
            .type = QCOW2_FEAT_TYPE_COMPATIBLE,
1319
            .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
1320
            .name = "lazy refcounts",
1321
        },
1322
    };
1323

    
1324
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
1325
                         features, sizeof(features), buflen);
1326
    if (ret < 0) {
1327
        goto fail;
1328
    }
1329
    buf += ret;
1330
    buflen -= ret;
1331

    
1332
    /* Keep unknown header extensions */
1333
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
1334
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
1335
        if (ret < 0) {
1336
            goto fail;
1337
        }
1338

    
1339
        buf += ret;
1340
        buflen -= ret;
1341
    }
1342

    
1343
    /* End of header extensions */
1344
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
1345
    if (ret < 0) {
1346
        goto fail;
1347
    }
1348

    
1349
    buf += ret;
1350
    buflen -= ret;
1351

    
1352
    /* Backing file name */
1353
    if (*bs->backing_file) {
1354
        size_t backing_file_len = strlen(bs->backing_file);
1355

    
1356
        if (buflen < backing_file_len) {
1357
            ret = -ENOSPC;
1358
            goto fail;
1359
        }
1360

    
1361
        /* Using strncpy is ok here, since buf is not NUL-terminated. */
1362
        strncpy(buf, bs->backing_file, buflen);
1363

    
1364
        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
1365
        header->backing_file_size   = cpu_to_be32(backing_file_len);
1366
    }
1367

    
1368
    /* Write the new header */
1369
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
1370
    if (ret < 0) {
1371
        goto fail;
1372
    }
1373

    
1374
    ret = 0;
1375
fail:
1376
    qemu_vfree(header);
1377
    return ret;
1378
}
1379

    
1380
/*
 * Record a new backing file name and format in @bs and rewrite the image
 * header accordingly.  A NULL @backing_file or @backing_fmt is stored as the
 * empty string.
 *
 * Returns the result of qcow2_update_header().
 */
static int qcow2_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    const char *new_file = backing_file ? backing_file : "";
    const char *new_fmt = backing_fmt ? backing_fmt : "";

    pstrcpy(bs->backing_file, sizeof(bs->backing_file), new_file);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format), new_fmt);

    return qcow2_update_header(bs);
}
1388

    
1389
/*
 * Allocate clusters for the whole virtual disk of @bs and link them into the
 * L2 tables, without writing any guest data.
 *
 * After the allocation loop the image file is extended, by writing one zeroed
 * sector, so that it covers the last allocated host sector.
 *
 * Returns 0 on success or a negative errno value on failure.
 */
static int preallocate(BlockDriverState *bs)
{
    int64_t length;
    uint64_t nb_sectors;
    uint64_t offset;
    uint64_t host_offset = 0;
    int num;
    int ret;
    QCowL2Meta *meta;

    /*
     * Do not turn an error from bdrv_getlength() into a huge unsigned
     * sector count.
     */
    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    nb_sectors = length >> 9;
    offset = 0;

    while (nb_sectors) {
        num = MIN(nb_sectors, INT_MAX >> 9);

        /*
         * Start from an empty list, like qcow2_co_writev() does, so that
         * "nothing to link" is distinguishable from an uninitialised pointer.
         */
        meta = NULL;
        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
                                         &host_offset, &meta);
        if (ret < 0) {
            return ret;
        }

        while (meta != NULL) {
            QCowL2Meta *next = meta->next;

            ret = qcow2_alloc_cluster_link_l2(bs, meta);
            if (ret < 0) {
                qcow2_free_any_clusters(bs, meta->alloc_offset,
                                        meta->nb_clusters,
                                        QCOW2_DISCARD_NEVER);
                return ret;
            }

            /* There are no dependent requests, but we need to remove our
             * request from the list of in-flight requests */
            if (meta->nb_clusters != 0) {
                QLIST_REMOVE(meta, next_in_flight);
            }

            /* The entry belongs to us once it is off the list */
            g_free(meta);
            meta = next;
        }

        /* TODO Preallocate data if requested */

        nb_sectors -= num;
        offset += num << 9;
    }

    /*
     * It is expected that the image file is large enough to actually contain
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
    if (host_offset != 0) {
        uint8_t buf[512];
        memset(buf, 0, 512);
        ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}
1444

    
1445
static int qcow2_create2(const char *filename, int64_t total_size,
1446
                         const char *backing_file, const char *backing_format,
1447
                         int flags, size_t cluster_size, int prealloc,
1448
                         QEMUOptionParameter *options, int version,
1449
                         Error **errp)
1450
{
1451
    /* Calculate cluster_bits */
1452
    int cluster_bits;
1453
    cluster_bits = ffs(cluster_size) - 1;
1454
    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
1455
        (1 << cluster_bits) != cluster_size)
1456
    {
1457
        error_setg(errp, "Cluster size must be a power of two between %d and "
1458
                   "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
1459
        return -EINVAL;
1460
    }
1461

    
1462
    /*
1463
     * Open the image file and write a minimal qcow2 header.
1464
     *
1465
     * We keep things simple and start with a zero-sized image. We also
1466
     * do without refcount blocks or a L1 table for now. We'll fix the
1467
     * inconsistency later.
1468
     *
1469
     * We do need a refcount table because growing the refcount table means
1470
     * allocating two new refcount blocks - the seconds of which would be at
1471
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
1472
     * size for any qcow2 image.
1473
     */
1474
    BlockDriverState* bs;
1475
    QCowHeader *header;
1476
    uint8_t* refcount_table;
1477
    Error *local_err = NULL;
1478
    int ret;
1479

    
1480
    ret = bdrv_create_file(filename, options, &local_err);
1481
    if (ret < 0) {
1482
        error_propagate(errp, local_err);
1483
        return ret;
1484
    }
1485

    
1486
    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
1487
    if (ret < 0) {
1488
        error_propagate(errp, local_err);
1489
        return ret;
1490
    }
1491

    
1492
    /* Write the header */
1493
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
1494
    header = g_malloc0(cluster_size);
1495
    *header = (QCowHeader) {
1496
        .magic                      = cpu_to_be32(QCOW_MAGIC),
1497
        .version                    = cpu_to_be32(version),
1498
        .cluster_bits               = cpu_to_be32(cluster_bits),
1499
        .size                       = cpu_to_be64(0),
1500
        .l1_table_offset            = cpu_to_be64(0),
1501
        .l1_size                    = cpu_to_be32(0),
1502
        .refcount_table_offset      = cpu_to_be64(cluster_size),
1503
        .refcount_table_clusters    = cpu_to_be32(1),
1504
        .refcount_order             = cpu_to_be32(3 + REFCOUNT_SHIFT),
1505
        .header_length              = cpu_to_be32(sizeof(*header)),
1506
    };
1507

    
1508
    if (flags & BLOCK_FLAG_ENCRYPT) {
1509
        header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1510
    } else {
1511
        header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1512
    }
1513

    
1514
    if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
1515
        header->compatible_features |=
1516
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
1517
    }
1518

    
1519
    ret = bdrv_pwrite(bs, 0, header, cluster_size);
1520
    g_free(header);
1521
    if (ret < 0) {
1522
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
1523
        goto out;
1524
    }
1525

    
1526
    /* Write an empty refcount table */
1527
    refcount_table = g_malloc0(cluster_size);
1528
    ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
1529
    g_free(refcount_table);
1530

    
1531
    if (ret < 0) {
1532
        error_setg_errno(errp, -ret, "Could not write refcount table");
1533
        goto out;
1534
    }
1535

    
1536
    bdrv_close(bs);
1537

    
1538
    /*
1539
     * And now open the image and make it consistent first (i.e. increase the
1540
     * refcount of the cluster that is occupied by the header and the refcount
1541
     * table)
1542
     */
1543
    BlockDriver* drv = bdrv_find_format("qcow2");
1544
    assert(drv != NULL);
1545
    ret = bdrv_open(bs, filename, NULL,
1546
        BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv, &local_err);
1547
    if (ret < 0) {
1548
        error_propagate(errp, local_err);
1549
        goto out;
1550
    }
1551

    
1552
    ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
1553
    if (ret < 0) {
1554
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
1555
                         "header and refcount table");
1556
        goto out;
1557

    
1558
    } else if (ret != 0) {
1559
        error_report("Huh, first cluster in empty image is already in use?");
1560
        abort();
1561
    }
1562

    
1563
    /* Okay, now that we have a valid image, let's give it the right size */
1564
    ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
1565
    if (ret < 0) {
1566
        error_setg_errno(errp, -ret, "Could not resize image");
1567
        goto out;
1568
    }
1569

    
1570
    /* Want a backing file? There you go.*/
1571
    if (backing_file) {
1572
        ret = bdrv_change_backing_file(bs, backing_file, backing_format);
1573
        if (ret < 0) {
1574
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
1575
                             "with format '%s'", backing_file, backing_format);
1576
            goto out;
1577
        }
1578
    }
1579

    
1580
    /* And if we're supposed to preallocate metadata, do that now */
1581
    if (prealloc) {
1582
        BDRVQcowState *s = bs->opaque;
1583
        qemu_co_mutex_lock(&s->lock);
1584
        ret = preallocate(bs);
1585
        qemu_co_mutex_unlock(&s->lock);
1586
        if (ret < 0) {
1587
            error_setg_errno(errp, -ret, "Could not preallocate metadata");
1588
            goto out;
1589
        }
1590
    }
1591

    
1592
    bdrv_close(bs);
1593

    
1594
    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */
1595
    ret = bdrv_open(bs, filename, NULL,
1596
                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING,
1597
                    drv, &local_err);
1598
    if (error_is_set(&local_err)) {
1599
        error_propagate(errp, local_err);
1600
        goto out;
1601
    }
1602

    
1603
    ret = 0;
1604
out:
1605
    bdrv_unref(bs);
1606
    return ret;
1607
}
1608

    
1609
static int qcow2_create(const char *filename, QEMUOptionParameter *options,
1610
                        Error **errp)
1611
{
1612
    const char *backing_file = NULL;
1613
    const char *backing_fmt = NULL;
1614
    uint64_t sectors = 0;
1615
    int flags = 0;
1616
    size_t cluster_size = DEFAULT_CLUSTER_SIZE;
1617
    int prealloc = 0;
1618
    int version = 3;
1619
    Error *local_err = NULL;
1620
    int ret;
1621

    
1622
    /* Read out options */
1623
    while (options && options->name) {
1624
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1625
            sectors = options->value.n / 512;
1626
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1627
            backing_file = options->value.s;
1628
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
1629
            backing_fmt = options->value.s;
1630
        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
1631
            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
1632
        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
1633
            if (options->value.n) {
1634
                cluster_size = options->value.n;
1635
            }
1636
        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1637
            if (!options->value.s || !strcmp(options->value.s, "off")) {
1638
                prealloc = 0;
1639
            } else if (!strcmp(options->value.s, "metadata")) {
1640
                prealloc = 1;
1641
            } else {
1642
                error_setg(errp, "Invalid preallocation mode: '%s'",
1643
                           options->value.s);
1644
                return -EINVAL;
1645
            }
1646
        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
1647
            if (!options->value.s) {
1648
                /* keep the default */
1649
            } else if (!strcmp(options->value.s, "0.10")) {
1650
                version = 2;
1651
            } else if (!strcmp(options->value.s, "1.1")) {
1652
                version = 3;
1653
            } else {
1654
                error_setg(errp, "Invalid compatibility level: '%s'",
1655
                           options->value.s);
1656
                return -EINVAL;
1657
            }
1658
        } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
1659
            flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
1660
        }
1661
        options++;
1662
    }
1663

    
1664
    if (backing_file && prealloc) {
1665
        error_setg(errp, "Backing file and preallocation cannot be used at "
1666
                   "the same time");
1667
        return -EINVAL;
1668
    }
1669

    
1670
    if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
1671
        error_setg(errp, "Lazy refcounts only supported with compatibility "
1672
                   "level 1.1 and above (use compat=1.1 or greater)");
1673
        return -EINVAL;
1674
    }
1675

    
1676
    ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
1677
                        cluster_size, prealloc, options, version, &local_err);
1678
    if (error_is_set(&local_err)) {
1679
        error_propagate(errp, local_err);
1680
    }
1681
    return ret;
1682
}
1683

    
1684
/*
 * Make-empty callback for qcow2.
 *
 * The only implementation is compiled out (and marked as incorrect), so the
 * function currently does nothing and always returns 0.
 */
static int qcow2_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1703

    
1704
/*
 * Zero out @nb_sectors sectors starting at @sector_num using zero clusters.
 *
 * Only requests whose start and length are both multiples of the cluster
 * size (in sectors) are handled here; anything else returns -ENOTSUP.
 * @flags is not used.
 *
 * Returns the result of qcow2_zero_clusters() for aligned requests.
 */
static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    /* Emulate misaligned zero writes */
    if ((sector_num % s->cluster_sectors) != 0 ||
        (nb_sectors % s->cluster_sectors) != 0) {
        return -ENOTSUP;
    }

    /* Whatever is left can use real zero clusters */
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}
1723

    
1724
/*
 * Discard @nb_sectors sectors starting at @sector_num.
 *
 * The sector number is converted to a byte offset and the request is passed
 * to qcow2_discard_clusters() with type QCOW2_DISCARD_REQUEST while the
 * driver lock is held.  Returns whatever qcow2_discard_clusters() returns.
 */
static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int64_t byte_offset = sector_num << BDRV_SECTOR_BITS;
    int ret;

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_discard_clusters(bs, byte_offset, nb_sectors,
                                 QCOW2_DISCARD_REQUEST);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}
1736

    
1737
/*
 * Resize the virtual disk to @offset bytes.
 *
 * Only growing is implemented.  The request is refused when
 *   - @offset is not a multiple of 512            (-EINVAL),
 *   - the image contains snapshots                (-ENOTSUP),
 *   - @offset is below the current virtual size   (-ENOTSUP).
 *
 * On success the L1 table has been grown to cover the new size, the size
 * field of the on-disk header has been rewritten (synchronously) and
 * l1_vm_state_index has been moved to the new L1 size.
 *
 * Returns 0 on success, a negative errno value otherwise.
 */
static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVQcowState *s = bs->opaque;
    int64_t new_l1_size;
    uint64_t be_size;
    int ret;

    if (offset & 511) {
        error_report("The new size must be a multiple of 512");
        return -EINVAL;
    }

    /* cannot proceed if image has snapshots */
    if (s->nb_snapshots) {
        error_report("Can't resize an image which has snapshots");
        return -ENOTSUP;
    }

    /* shrinking is currently not supported */
    if (offset < bs->total_sectors * 512) {
        error_report("qcow2 doesn't support shrinking images yet");
        return -ENOTSUP;
    }

    /* make the L1 table large enough for the new size */
    new_l1_size = size_to_l1(s, offset);
    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
    if (ret < 0) {
        return ret;
    }

    /* write updated header.size; the header stores it big-endian */
    be_size = cpu_to_be64(offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
                           &be_size, sizeof(be_size));
    if (ret < 0) {
        return ret;
    }

    s->l1_vm_state_index = new_l1_size;
    return 0;
}
1777

    
1778
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
/*
 * Write one cluster of guest data in compressed form.
 *
 * @sector_num: first guest sector of the cluster
 * @buf:        data to write (@nb_sectors * BDRV_SECTOR_SIZE bytes)
 * @nb_sectors: must be one of
 *                - 0: no data is written; the image file is instead extended
 *                  to the next 512-byte boundary,
 *                - s->cluster_sectors: a full cluster,
 *                - fewer than a cluster, but only when the request ends
 *                  exactly at the end of the virtual disk; the data is then
 *                  zero-padded to a full cluster.
 *              Any other value yields -EINVAL.
 *
 * If the data cannot be deflated into less than one cluster it is written
 * uncompressed through bdrv_write().
 *
 * Returns 0 on success, a negative errno value otherwise.
 */
static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                  const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        int64_t file_length = bdrv_getlength(bs->file);

        if (file_length < 0) {
            /* a negative errno must not be rounded up as if it were a size */
            return file_length;
        }
        cluster_offset = ((uint64_t)file_length + 511) & ~(uint64_t)511;
        ret = bdrv_truncate(bs->file, cluster_offset);
        if (ret < 0) {
            return ret;
        }
        return 0;
    }

    if (nb_sectors != s->cluster_sectors) {
        ret = -EINVAL;

        /* Zero-pad last write if image size is not cluster aligned */
        if (sector_num + nb_sectors == bs->total_sectors &&
            nb_sectors < s->cluster_sectors) {
            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
            memset(pad_buf, 0, s->cluster_size);
            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
            ret = qcow2_write_compressed(bs, sector_num,
                                         pad_buf, s->cluster_sectors);
            qemu_vfree(pad_buf);
        }
        return ret;
    }

    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* default compression level, small window (2^12 bytes), no zlib header
     * (negative windowBits selects a raw deflate stream) */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
        ret = -EINVAL;
        goto fail;
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    /* only one cluster of output is offered: a result that does not fit is
     * of no use and takes the uncompressed path below */
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        deflateEnd(&strm);
        ret = -EINVAL;
        goto fail;
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
        if (ret < 0) {
            goto fail;
        }
    } else {
        cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
            sector_num << 9, out_len);
        if (!cluster_offset) {
            ret = -EIO;
            goto fail;
        }
        cluster_offset &= s->cluster_offset_mask;

        ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
        if (ret < 0) {
            goto fail;
        }

        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;
fail:
    g_free(out_buf);
    return ret;
}
1873

    
1874
/*
 * Flush the driver's metadata caches.
 *
 * With the driver lock held, the L2 table cache is flushed first; the
 * refcount block cache is flushed afterwards, but only if
 * qcow2_need_accurate_refcounts() says so.
 *
 * Returns 0 on success or the negative error of the flush that failed.
 */
static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    qemu_co_mutex_lock(&s->lock);

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret >= 0 && qcow2_need_accurate_refcounts(s)) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    }

    qemu_co_mutex_unlock(&s->lock);

    /* success is always reported as exactly 0 */
    return ret < 0 ? ret : 0;
}
1897

    
1898
/*
 * Fill @bdi with driver information about the image: cluster size, VM state
 * offset, and two capability flags (unallocated blocks read as zero; zero
 * writes with unmap are available from image version 3 on).
 *
 * Always returns 0.
 */
static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;

    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset = qcow2_vm_state_offset(s);

    bdi->unallocated_blocks_are_zero = true;
    bdi->can_write_zeroes_with_unmap = s->qcow_version >= 3;

    return 0;
}
1907

    
1908
/*
 * Build the qcow2-specific image information.
 *
 * Returns a newly allocated ImageInfoSpecific of kind
 * IMAGE_INFO_SPECIFIC_KIND_QCOW2 whose qcow2 member is newly allocated as
 * well:
 *   - version 2 images report compat "0.10",
 *   - version 3 images report compat "1.1" plus the state of the lazy
 *     refcounts compatible feature bit.
 * The compat strings are g_strdup()'d copies.
 */
static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);

    *spec_info = (ImageInfoSpecific){
        .kind  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
        {
            .qcow2 = g_new(ImageInfoSpecificQCow2, 1),
        },
    };
    if (s->qcow_version == 2) {
        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
            .compat = g_strdup("0.10"),
        };
    } else if (s->qcow_version == 3) {
        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
            .has_lazy_refcounts = true,
        };
    } else {
        /* g_new() does not clear memory, so falling through here would hand
         * out an uninitialised structure; if this assertion fails, this
         * probably means a new version was added without having it covered
         * here */
        assert(false);
    }

    return spec_info;
}
1934

    
1935
#if 0
/*
 * Debugging aid (compiled out): print the refcounts of all clusters of the
 * image file, one line per run of consecutive clusters sharing the same
 * refcount, in the form "<first cluster>: refcount=<r> nb=<run length>".
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(bs->file);
    nb_clusters = size_to_clusters(s, size);
    for(k = 0; k < nb_clusters;) {
        /* k1 is the start of the run, k ends up one past its last cluster */
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        /* label the run with its first cluster (k1), not with k */
        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k1, refcount,
               k - k1);
    }
}
#endif
1955

    
1956
/*
 * Write the VM state data in @qiov at position @pos of the VM state area,
 * i.e. at qcow2_vm_state_offset() + @pos of the image.
 *
 * For the duration of the write the device is marked growable and
 * zero_beyond_eof is switched off; growable, zero_beyond_eof and
 * total_sectors are put back to their previous values afterwards.
 *
 * Returns the result of bdrv_pwritev().
 */
static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
{
    BDRVQcowState *s = bs->opaque;
    const int64_t saved_total_sectors = bs->total_sectors;
    const int saved_growable = bs->growable;
    const bool saved_zero_beyond_eof = bs->zero_beyond_eof;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);

    bs->growable = 1;
    bs->zero_beyond_eof = false;
    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
    bs->growable = saved_growable;
    bs->zero_beyond_eof = saved_zero_beyond_eof;

    /* bdrv_co_do_writev will have increased the total_sectors value to include
     * the VM state - the VM state is however not an actual part of the block
     * device, therefore, we need to restore the old value. */
    bs->total_sectors = saved_total_sectors;

    return ret;
}
1979

    
1980
/*
 * Read @size bytes of VM state into @buf, starting at position @pos of the
 * VM state area (qcow2_vm_state_offset() + @pos of the image).
 *
 * While reading, the device is marked growable and zero_beyond_eof is
 * switched off; both fields get their previous values back afterwards.
 *
 * Returns the result of bdrv_pread().
 */
static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                              int64_t pos, int size)
{
    BDRVQcowState *s = bs->opaque;
    const int saved_growable = bs->growable;
    const bool saved_zero_beyond_eof = bs->zero_beyond_eof;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);

    bs->growable = 1;
    bs->zero_beyond_eof = false;
    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
    bs->growable = saved_growable;
    bs->zero_beyond_eof = saved_zero_beyond_eof;

    return ret;
}
1997

    
1998
/*
1999
 * Downgrades an image's version. To achieve this, any incompatible features
2000
 * have to be removed.
2001
 */
2002
static int qcow2_downgrade(BlockDriverState *bs, int target_version)
2003
{
2004
    BDRVQcowState *s = bs->opaque;
2005
    int current_version = s->qcow_version;
2006
    int ret;
2007

    
2008
    if (target_version == current_version) {
2009
        return 0;
2010
    } else if (target_version > current_version) {
2011
        return -EINVAL;
2012
    } else if (target_version != 2) {
2013
        return -EINVAL;
2014
    }
2015

    
2016
    if (s->refcount_order != 4) {
2017
        /* we would have to convert the image to a refcount_order == 4 image
2018
         * here; however, since qemu (at the time of writing this) does not
2019
         * support anything different than 4 anyway, there is no point in doing
2020
         * so right now; however, we should error out (if qemu supports this in
2021
         * the future and this code has not been adapted) */
2022
        error_report("qcow2_downgrade: Image refcount orders other than 4 are "
2023
                     "currently not supported.");
2024
        return -ENOTSUP;
2025
    }
2026

    
2027
    /* clear incompatible features */
2028
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
2029
        ret = qcow2_mark_clean(bs);
2030
        if (ret < 0) {
2031
            return ret;
2032
        }
2033
    }
2034

    
2035
    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
2036
     * the first place; if that happens nonetheless, returning -ENOTSUP is the
2037
     * best thing to do anyway */
2038

    
2039
    if (s->incompatible_features) {
2040
        return -ENOTSUP;
2041
    }
2042

    
2043
    /* since we can ignore compatible features, we can set them to 0 as well */
2044
    s->compatible_features = 0;
2045
    /* if lazy refcounts have been used, they have already been fixed through
2046
     * clearing the dirty flag */
2047

    
2048
    /* clearing autoclear features is trivial */
2049
    s->autoclear_features = 0;
2050

    
2051
    ret = qcow2_expand_zero_clusters(bs);
2052
    if (ret < 0) {
2053
        return ret;
2054
    }
2055

    
2056
    s->qcow_version = target_version;
2057
    ret = qcow2_update_header(bs);
2058
    if (ret < 0) {
2059
        s->qcow_version = current_version;
2060
        return ret;
2061
    }
2062
    return 0;
2063
}
2064

    
2065
static int qcow2_amend_options(BlockDriverState *bs,
2066
                               QEMUOptionParameter *options)
2067
{
2068
    BDRVQcowState *s = bs->opaque;
2069
    int old_version = s->qcow_version, new_version = old_version;
2070
    uint64_t new_size = 0;
2071
    const char *backing_file = NULL, *backing_format = NULL;
2072
    bool lazy_refcounts = s->use_lazy_refcounts;
2073
    int ret;
2074
    int i;
2075

    
2076
    for (i = 0; options[i].name; i++)
2077
    {
2078
        if (!options[i].assigned) {
2079
            /* only change explicitly defined options */
2080
            continue;
2081
        }
2082

    
2083
        if (!strcmp(options[i].name, "compat")) {
2084
            if (!options[i].value.s) {
2085
                /* preserve default */
2086
            } else if (!strcmp(options[i].value.s, "0.10")) {
2087
                new_version = 2;
2088
            } else if (!strcmp(options[i].value.s, "1.1")) {
2089
                new_version = 3;
2090
            } else {
2091
                fprintf(stderr, "Unknown compatibility level %s.\n",
2092
                        options[i].value.s);
2093
                return -EINVAL;
2094
            }
2095
        } else if (!strcmp(options[i].name, "preallocation")) {
2096
            fprintf(stderr, "Cannot change preallocation mode.\n");
2097
            return -ENOTSUP;
2098
        } else if (!strcmp(options[i].name, "size")) {
2099
            new_size = options[i].value.n;
2100
        } else if (!strcmp(options[i].name, "backing_file")) {
2101
            backing_file = options[i].value.s;
2102
        } else if (!strcmp(options[i].name, "backing_fmt")) {
2103
            backing_format = options[i].value.s;
2104
        } else if (!strcmp(options[i].name, "encryption")) {
2105
            if ((options[i].value.n != !!s->crypt_method)) {
2106
                fprintf(stderr, "Changing the encryption flag is not "
2107
                        "supported.\n");
2108
                return -ENOTSUP;
2109
            }
2110
        } else if (!strcmp(options[i].name, "cluster_size")) {
2111
            if (options[i].value.n != s->cluster_size) {
2112
                fprintf(stderr, "Changing the cluster size is not "
2113
                        "supported.\n");
2114
                return -ENOTSUP;
2115
            }
2116
        } else if (!strcmp(options[i].name, "lazy_refcounts")) {
2117
            lazy_refcounts = options[i].value.n;
2118
        } else {
2119
            /* if this assertion fails, this probably means a new option was
2120
             * added without having it covered here */
2121
            assert(false);
2122
        }
2123
    }
2124

    
2125
    if (new_version != old_version) {
2126
        if (new_version > old_version) {
2127
            /* Upgrade */
2128
            s->qcow_version = new_version;
2129
            ret = qcow2_update_header(bs);
2130
            if (ret < 0) {
2131
                s->qcow_version = old_version;
2132
                return ret;
2133
            }
2134
        } else {
2135
            ret = qcow2_downgrade(bs, new_version);
2136
            if (ret < 0) {
2137
                return ret;
2138
            }
2139
        }
2140
    }
2141

    
2142
    if (backing_file || backing_format) {
2143
        ret = qcow2_change_backing_file(bs, backing_file ?: bs->backing_file,
2144
                                        backing_format ?: bs->backing_format);
2145
        if (ret < 0) {
2146
            return ret;
2147
        }
2148
    }
2149

    
2150
    if (s->use_lazy_refcounts != lazy_refcounts) {
2151
        if (lazy_refcounts) {
2152
            if (s->qcow_version < 3) {
2153
                fprintf(stderr, "Lazy refcounts only supported with compatibility "
2154
                        "level 1.1 and above (use compat=1.1 or greater)\n");
2155
                return -EINVAL;
2156
            }
2157
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
2158
            ret = qcow2_update_header(bs);
2159
            if (ret < 0) {
2160
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
2161
                return ret;
2162
            }
2163
            s->use_lazy_refcounts = true;
2164
        } else {
2165
            /* make image clean first */
2166
            ret = qcow2_mark_clean(bs);
2167
            if (ret < 0) {
2168
                return ret;
2169
            }
2170
            /* now disallow lazy refcounts */
2171
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
2172
            ret = qcow2_update_header(bs);
2173
            if (ret < 0) {
2174
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
2175
                return ret;
2176
            }
2177
            s->use_lazy_refcounts = false;
2178
        }
2179
    }
2180

    
2181
    if (new_size) {
2182
        ret = bdrv_truncate(bs, new_size);
2183
        if (ret < 0) {
2184
            return ret;
2185
        }
2186
    }
2187

    
2188
    return 0;
2189
}
2190

    
2191
static QEMUOptionParameter qcow2_create_options[] = {
2192
    {
2193
        .name = BLOCK_OPT_SIZE,
2194
        .type = OPT_SIZE,
2195
        .help = "Virtual disk size"
2196
    },
2197
    {
2198
        .name = BLOCK_OPT_COMPAT_LEVEL,
2199
        .type = OPT_STRING,
2200
        .help = "Compatibility level (0.10 or 1.1)"
2201
    },
2202
    {
2203
        .name = BLOCK_OPT_BACKING_FILE,
2204
        .type = OPT_STRING,
2205
        .help = "File name of a base image"
2206
    },
2207
    {
2208
        .name = BLOCK_OPT_BACKING_FMT,
2209
        .type = OPT_STRING,
2210
        .help = "Image format of the base image"
2211
    },
2212
    {
2213
        .name = BLOCK_OPT_ENCRYPT,
2214
        .type = OPT_FLAG,
2215
        .help = "Encrypt the image"
2216
    },
2217
    {
2218
        .name = BLOCK_OPT_CLUSTER_SIZE,
2219
        .type = OPT_SIZE,
2220
        .help = "qcow2 cluster size",
2221
        .value = { .n = DEFAULT_CLUSTER_SIZE },
2222
    },
2223
    {
2224
        .name = BLOCK_OPT_PREALLOC,
2225
        .type = OPT_STRING,
2226
        .help = "Preallocation mode (allowed values: off, metadata)"
2227
    },
2228
    {
2229
        .name = BLOCK_OPT_LAZY_REFCOUNTS,
2230
        .type = OPT_FLAG,
2231
        .help = "Postpone refcount updates",
2232
    },
2233
    { NULL }
2234
};
2235

    
2236
static BlockDriver bdrv_qcow2 = {
2237
    .format_name        = "qcow2",
2238
    .instance_size      = sizeof(BDRVQcowState),
2239
    .bdrv_probe         = qcow2_probe,
2240
    .bdrv_open          = qcow2_open,
2241
    .bdrv_close         = qcow2_close,
2242
    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
2243
    .bdrv_create        = qcow2_create,
2244
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
2245
    .bdrv_co_get_block_status = qcow2_co_get_block_status,
2246
    .bdrv_set_key       = qcow2_set_key,
2247
    .bdrv_make_empty    = qcow2_make_empty,
2248

    
2249
    .bdrv_co_readv          = qcow2_co_readv,
2250
    .bdrv_co_writev         = qcow2_co_writev,
2251
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
2252

    
2253
    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
2254
    .bdrv_co_discard        = qcow2_co_discard,
2255
    .bdrv_truncate          = qcow2_truncate,
2256
    .bdrv_write_compressed  = qcow2_write_compressed,
2257

    
2258
    .bdrv_snapshot_create   = qcow2_snapshot_create,
2259
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
2260
    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
2261
    .bdrv_snapshot_list     = qcow2_snapshot_list,
2262
    .bdrv_snapshot_load_tmp     = qcow2_snapshot_load_tmp,
2263
    .bdrv_get_info      = qcow2_get_info,
2264
    .bdrv_get_specific_info = qcow2_get_specific_info,
2265

    
2266
    .bdrv_save_vmstate    = qcow2_save_vmstate,
2267
    .bdrv_load_vmstate    = qcow2_load_vmstate,
2268

    
2269
    .bdrv_change_backing_file   = qcow2_change_backing_file,
2270

    
2271
    .bdrv_invalidate_cache      = qcow2_invalidate_cache,
2272

    
2273
    .create_options = qcow2_create_options,
2274
    .bdrv_check = qcow2_check,
2275
    .bdrv_amend_options = qcow2_amend_options,
2276
};
2277

    
2278
static void bdrv_qcow2_init(void)
2279
{
2280
    bdrv_register(&bdrv_qcow2);
2281
}
2282

    
2283
block_init(bdrv_qcow2_init);