Statistics
| Branch: | Revision:

root / block / vhdx.c @ d92aa883

History | View | Annotate | Download (43 kB)

1
/*
2
 * Block driver for Hyper-V VHDX Images
3
 *
4
 * Copyright (c) 2013 Red Hat, Inc.,
5
 *
6
 * Authors:
7
 *  Jeff Cody <jcody@redhat.com>
8
 *
9
 *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012
10
 *  by Microsoft:
11
 *      https://www.microsoft.com/en-us/download/details.aspx?id=34750
12
 *
13
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
14
 * See the COPYING.LIB file in the top-level directory.
15
 *
16
 */
17

    
18
#include "qemu-common.h"
19
#include "block/block_int.h"
20
#include "qemu/module.h"
21
#include "qemu/crc32c.h"
22
#include "block/vhdx.h"
23
#include "migration/migration.h"
24

    
25
#include <uuid/uuid.h>
26

    
27
/* Several metadata and region table data entries are identified by
 * GUIDs in an MS-specific GUID format. */
29

    
30

    
31
/* ------- Known Region Table GUIDs ---------------------- */
32
static const MSGUID bat_guid =      { .data1 = 0x2dc27766,
33
                                      .data2 = 0xf623,
34
                                      .data3 = 0x4200,
35
                                      .data4 = { 0x9d, 0x64, 0x11, 0x5e,
36
                                                 0x9b, 0xfd, 0x4a, 0x08} };
37

    
38
static const MSGUID metadata_guid = { .data1 = 0x8b7ca206,
39
                                      .data2 = 0x4790,
40
                                      .data3 = 0x4b9a,
41
                                      .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
42
                                                 0x05, 0x0f, 0x88, 0x6e} };
43

    
44

    
45

    
46
/* ------- Known Metadata Entry GUIDs ---------------------- */
47
static const MSGUID file_param_guid =   { .data1 = 0xcaa16737,
48
                                          .data2 = 0xfa36,
49
                                          .data3 = 0x4d43,
50
                                          .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
51
                                                     0xaa, 0x44, 0xe7, 0x6b} };
52

    
53
static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224,
54
                                          .data2 = 0xcd1b,
55
                                          .data3 = 0x4876,
56
                                          .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
57
                                                     0xd8, 0x3b, 0xf4, 0xb8} };
58

    
59
static const MSGUID page83_guid =       { .data1 = 0xbeca12ab,
60
                                          .data2 = 0xb2e6,
61
                                          .data3 = 0x4523,
62
                                          .data4 = { 0x93, 0xef, 0xc3, 0x09,
63
                                                     0xe0, 0x00, 0xc7, 0x46} };
64

    
65

    
66
static const MSGUID phys_sector_guid =  { .data1 = 0xcda348c7,
67
                                          .data2 = 0x445d,
68
                                          .data3 = 0x4471,
69
                                          .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
70
                                                     0x52, 0x51, 0xc5, 0x56} };
71

    
72
static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d,
73
                                            .data2 = 0xb30b,
74
                                            .data3 = 0x454d,
75
                                            .data4 = { 0xab, 0xf7, 0xd3,
76
                                                       0xd8, 0x48, 0x34,
77
                                                       0xab, 0x0c} };
78

    
79
static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
80
                                            .data2 = 0xa96f,
81
                                            .data3 = 0x4709,
82
                                            .data4 = { 0xba, 0x47, 0xf2,
83
                                                       0x33, 0xa8, 0xfa,
84
                                                       0xab, 0x5f} };
85

    
86
/* Each parent type must have a valid GUID; this is for parent images
87
 * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
88
 * need to make up our own QCOW2 GUID type */
89
static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7,
90
                                         .data2 = 0xd19e,
91
                                         .data3 = 0x4a81,
92
                                         .data4 = { 0xb7, 0x89, 0x25, 0xb8,
93
                                                    0xe9, 0x44, 0x59, 0x13} };
94

    
95

    
96
/* One bit per known metadata item; OR-ed together in
 * metadata_entries.present while the metadata table is parsed. */
#define META_FILE_PARAMETER_PRESENT      (1 << 0)
#define META_VIRTUAL_DISK_SIZE_PRESENT   (1 << 1)
#define META_PAGE_83_PRESENT             (1 << 2)
#define META_LOGICAL_SECTOR_SIZE_PRESENT (1 << 3)
#define META_PHYS_SECTOR_SIZE_PRESENT    (1 << 4)
#define META_PARENT_LOCATOR_PRESENT      (1 << 5)

/* The five items that must always be present.  The parent locator bit is
 * deliberately not part of this mask. */
#define META_ALL_PRESENT                   \
    (META_FILE_PARAMETER_PRESENT         | \
     META_VIRTUAL_DISK_SIZE_PRESENT      | \
     META_PAGE_83_PRESENT                | \
     META_LOGICAL_SECTOR_SIZE_PRESENT    | \
     META_PHYS_SECTOR_SIZE_PRESENT)
107

    
108

    
109
/* Location details for one sector-range access inside a payload block. */
typedef struct VHDXSectorInfo {
    /* BAT entry index */
    uint32_t bat_idx;
    /* sectors available in payload block */
    uint32_t sectors_avail;
    /* bytes left in the block after data to r/w */
    uint32_t bytes_left;
    /* bytes available in payload block */
    uint32_t bytes_avail;
    /* absolute offset in bytes, in file */
    uint64_t file_offset;
    /* block offset, in bytes */
    uint64_t block_offset;
} VHDXSectorInfo;
117

    
118
/* Recomputes the CRC-32C of a buffer and stores it inside that buffer.
 *
 * The 4-byte crc field is zeroed, the crc is computed over the whole buffer
 * with a seed of 0xffffffff, and the result is then written into the field.
 *
 * buf:        buffer pointer (must not be NULL)
 * size:       size of buffer (must be > crc_offset+4)
 * crc_offset: byte offset in buf of the buffer crc
 *
 * Returns the new checksum.
 *
 * Note: The checksum is stored and returned in the CPU endianness, not
 *       necessarily in the file format endianness (LE).  Any header export
 *       to disk should make sure that vhdx_header_le_export() is used to
 *       convert to the correct endianness.
 */
uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)
{
    uint32_t crc;
    uint8_t *crc_field;

    assert(buf != NULL);
    assert(size > (crc_offset + sizeof(crc)));

    crc_field = buf + crc_offset;

    /* the crc field itself counts as zero while hashing */
    memset(crc_field, 0, sizeof(crc));
    crc = crc32c(0xffffffff, buf, size);
    memcpy(crc_field, &crc, sizeof(crc));

    return crc;
}
143

    
144
/* Computes a CRC-32C over a buffer, leaving the buffer contents unchanged
 * on return.
 *
 * crc:        seed value handed to crc32c()
 * buf:        buffer pointer (must not be NULL)
 * size:       number of bytes to hash
 * crc_offset: if > 0, byte offset of a 4-byte crc field that is treated as
 *             zero during the calculation (it is saved, zeroed, and put back
 *             afterwards).  For values <= 0 the buffer is hashed as-is.
 *
 * Returns the value produced by crc32c().
 */
uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
                            int crc_offset)
{
    const bool mask_crc_field = crc_offset > 0;
    uint32_t saved_field = 0;
    uint32_t result;

    assert(buf != NULL);

    if (mask_crc_field) {
        memcpy(&saved_field, buf + crc_offset, sizeof(saved_field));
        memset(buf + crc_offset, 0, sizeof(saved_field));
    }

    result = crc32c(crc, buf, size);

    if (mask_crc_field) {
        /* restore the bytes we blanked out above */
        memcpy(buf + crc_offset, &saved_field, sizeof(saved_field));
    }

    return result;
}
163

    
164
/* Checks a buffer against the CRC stored inside it.
 *
 * The stored crc is read as a little-endian 32-bit value.  The crc is then
 * recomputed via vhdx_checksum_calc() with the crc field counted as zero;
 * the field is restored afterwards.  Because the buffer is modified during
 * the calculation, this may not be suitable for multi-threaded use.
 *
 * buf:        buffer pointer (must not be NULL)
 * size:       size of buffer (must be > crc_offset+4)
 * crc_offset: byte offset in buf of the buffer crc
 *
 * Returns true if the checksum is valid, false otherwise.
 */
bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
{
    uint32_t stored;
    uint32_t computed;

    assert(buf != NULL);
    assert(size > (crc_offset + 4));

    memcpy(&stored, buf + crc_offset, sizeof(stored));
    stored = le32_to_cpu(stored);

    computed = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);

    return computed == stored;
}
192

    
193

    
194
/*
195
 * This generates a UUID that is compliant with the MS GUIDs used
196
 * in the VHDX spec (and elsewhere).
197
 */
198
void vhdx_guid_generate(MSGUID *guid)
199
{
200
    uuid_t uuid;
201
    assert(guid != NULL);
202

    
203
    uuid_generate(uuid);
204
    memcpy(guid, uuid, sizeof(MSGUID));
205
}
206

    
207
/* Check for region overlaps inside the VHDX image */
208
static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length)
209
{
210
    int ret = 0;
211
    uint64_t end;
212
    VHDXRegionEntry *r;
213

    
214
    end = start + length;
215
    QLIST_FOREACH(r, &s->regions, entries) {
216
        if (!((start >= r->end) || (end <= r->start))) {
217
            ret = -EINVAL;
218
            goto exit;
219
        }
220
    }
221

    
222
exit:
223
    return ret;
224
}
225

    
226
/* Register a region for future checks */
227
static void vhdx_region_register(BDRVVHDXState *s,
228
                                 uint64_t start, uint64_t length)
229
{
230
    VHDXRegionEntry *r;
231

    
232
    r = g_malloc0(sizeof(*r));
233

    
234
    r->start = start;
235
    r->end = start + length;
236

    
237
    QLIST_INSERT_HEAD(&s->regions, r, entries);
238
}
239

    
240
/* Free all registered regions */
241
static void vhdx_region_unregister_all(BDRVVHDXState *s)
242
{
243
    VHDXRegionEntry *r, *r_next;
244

    
245
    QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) {
246
        QLIST_REMOVE(r, entries);
247
        g_free(r);
248
    }
249
}
250

    
251
/*
 * Format probe.
 *
 * Per the MS VHDX Specification, for every VHDX file:
 *      - The header section is fixed size - 1 MB
 *      - The header section is always the first "object"
 *      - The first 64KB of the header is the File Identifier
 *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
 *      - The following 512 bytes constitute a UTF-16 string identifying the
 *        software that created the file, and is optional and diagnostic only.
 *
 * Therefore, we probe by looking for the signature "vhdxfile" in the first
 * 8 bytes of buf.
 *
 * Returns 100 on a match; 0 if buf_size is below 8 or the bytes differ.
 * filename is not used.
 */
static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char file_signature[] = "vhdxfile";
    const int signature_len = 8;

    if (buf_size < signature_len) {
        return 0;
    }

    return memcmp(buf, file_signature, signature_len) == 0 ? 100 : 0;
}
269

    
270
/* Update the VHDX headers
271
 *
272
 * This follows the VHDX spec procedures for header updates.
273
 *
274
 *  - non-current header is updated with largest sequence number
275
 */
276
static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s,
277
                              bool generate_data_write_guid, MSGUID *log_guid)
278
{
279
    int ret = 0;
280
    int hdr_idx = 0;
281
    uint64_t header_offset = VHDX_HEADER1_OFFSET;
282

    
283
    VHDXHeader *active_header;
284
    VHDXHeader *inactive_header;
285
    VHDXHeader header_le;
286
    uint8_t *buffer;
287

    
288
    /* operate on the non-current header */
289
    if (s->curr_header == 0) {
290
        hdr_idx = 1;
291
        header_offset = VHDX_HEADER2_OFFSET;
292
    }
293

    
294
    active_header   = s->headers[s->curr_header];
295
    inactive_header = s->headers[hdr_idx];
296

    
297
    inactive_header->sequence_number = active_header->sequence_number + 1;
298

    
299
    /* a new file guid must be generated before any file write, including
300
     * headers */
301
    inactive_header->file_write_guid = s->session_guid;
302

    
303
    /* a new data guid only needs to be generated before any guest-visible
304
     * writes (i.e. something observable via virtual disk read) */
305
    if (generate_data_write_guid) {
306
        vhdx_guid_generate(&inactive_header->data_write_guid);
307
    }
308

    
309
    /* update the log guid if present */
310
    if (log_guid) {
311
        inactive_header->log_guid = *log_guid;
312
    }
313

    
314
    /* the header checksum is not over just the packed size of VHDXHeader,
315
     * but rather over the entire 'reserved' range for the header, which is
316
     * 4KB (VHDX_HEADER_SIZE). */
317

    
318
    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
319
    /* we can't assume the extra reserved bytes are 0 */
320
    ret = bdrv_pread(bs->file, header_offset, buffer, VHDX_HEADER_SIZE);
321
    if (ret < 0) {
322
        goto exit;
323
    }
324
    /* overwrite the actual VHDXHeader portion */
325
    memcpy(buffer, inactive_header, sizeof(VHDXHeader));
326
    inactive_header->checksum =
327
                        vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
328
                                             offsetof(VHDXHeader, checksum));
329
    vhdx_header_le_export(inactive_header, &header_le);
330
    ret = bdrv_pwrite_sync(bs->file, header_offset, &header_le,
331
                           sizeof(VHDXHeader));
332
    if (ret < 0) {
333
        goto exit;
334
    }
335
    s->curr_header = hdr_idx;
336

    
337
exit:
338
    qemu_vfree(buffer);
339
    return ret;
340
}
341

    
342
/*
343
 * The VHDX spec calls for header updates to be performed twice, so that both
344
 * the current and non-current header have valid info
345
 */
346
int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s,
347
                        bool generate_data_write_guid, MSGUID *log_guid)
348
{
349
    int ret;
350

    
351
    ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid);
352
    if (ret < 0) {
353
        return ret;
354
    }
355
    ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid);
356
    return ret;
357
}
358

    
359
/* opens the specified header block from the VHDX file header section */
360
static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
361
{
362
    int ret = 0;
363
    VHDXHeader *header1;
364
    VHDXHeader *header2;
365
    bool h1_valid = false;
366
    bool h2_valid = false;
367
    uint64_t h1_seq = 0;
368
    uint64_t h2_seq = 0;
369
    uint8_t *buffer;
370

    
371
    /* header1 & header2 are freed in vhdx_close() */
372
    header1 = qemu_blockalign(bs, sizeof(VHDXHeader));
373
    header2 = qemu_blockalign(bs, sizeof(VHDXHeader));
374

    
375
    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
376

    
377
    s->headers[0] = header1;
378
    s->headers[1] = header2;
379

    
380
    /* We have to read the whole VHDX_HEADER_SIZE instead of
381
     * sizeof(VHDXHeader), because the checksum is over the whole
382
     * region */
383
    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
384
    if (ret < 0) {
385
        goto fail;
386
    }
387
    /* copy over just the relevant portion that we need */
388
    memcpy(header1, buffer, sizeof(VHDXHeader));
389
    vhdx_header_le_import(header1);
390

    
391
    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
392
        !memcmp(&header1->signature, "head", 4)             &&
393
        header1->version == 1) {
394
        h1_seq = header1->sequence_number;
395
        h1_valid = true;
396
    }
397

    
398
    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
399
    if (ret < 0) {
400
        goto fail;
401
    }
402
    /* copy over just the relevant portion that we need */
403
    memcpy(header2, buffer, sizeof(VHDXHeader));
404
    vhdx_header_le_import(header2);
405

    
406
    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
407
        !memcmp(&header2->signature, "head", 4)             &&
408
        header2->version == 1) {
409
        h2_seq = header2->sequence_number;
410
        h2_valid = true;
411
    }
412

    
413
    /* If there is only 1 valid header (or no valid headers), we
414
     * don't care what the sequence numbers are */
415
    if (h1_valid && !h2_valid) {
416
        s->curr_header = 0;
417
    } else if (!h1_valid && h2_valid) {
418
        s->curr_header = 1;
419
    } else if (!h1_valid && !h2_valid) {
420
        ret = -EINVAL;
421
        goto fail;
422
    } else {
423
        /* If both headers are valid, then we choose the active one by the
424
         * highest sequence number.  If the sequence numbers are equal, that is
425
         * invalid */
426
        if (h1_seq > h2_seq) {
427
            s->curr_header = 0;
428
        } else if (h2_seq > h1_seq) {
429
            s->curr_header = 1;
430
        } else {
431
            ret = -EINVAL;
432
            goto fail;
433
        }
434
    }
435

    
436
    vhdx_region_register(s, s->headers[s->curr_header]->log_offset,
437
                            s->headers[s->curr_header]->log_length);
438

    
439
    ret = 0;
440

    
441
    goto exit;
442

    
443
fail:
444
    qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found");
445
    qemu_vfree(header1);
446
    qemu_vfree(header2);
447
    s->headers[0] = NULL;
448
    s->headers[1] = NULL;
449
exit:
450
    qemu_vfree(buffer);
451
    return ret;
452
}
453

    
454

    
455
/* Reads and validates the region table, and records the BAT and metadata
 * region entries.
 *
 * The 64KB block at VHDX_REGION_TABLE_OFFSET is read, its header is copied
 * into s->rt (converted to CPU endianness), and every entry is:
 *   - checked for overlap against all regions registered so far, then
 *     registered itself;
 *   - stored in s->bat_rt / s->metadata_rt if its GUID is the BAT or
 *     metadata GUID (each may appear only once);
 *   - rejected if it is unknown but flagged as required.
 *
 * Returns 0 on success; -EINVAL for a bad checksum/signature, too many
 * entries, overlapping regions, duplicate or missing BAT/metadata entries;
 * -ENOTSUP for an unknown required entry; or the negative read error.
 */
static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
{
    int ret = 0;
    uint8_t *buffer;
    int offset = 0;
    VHDXRegionTableEntry rt_entry;
    uint32_t i;
    bool signature_ok;
    bool bat_rt_found = false;
    bool metadata_rt_found = false;

    /* We have to read the whole 64KB block, because the crc32 is over the
     * whole block */
    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);

    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
                     VHDX_HEADER_BLOCK_SIZE);
    if (ret < 0) {
        goto fail;
    }
    memcpy(&s->rt, buffer, sizeof(s->rt));

    /* The signature is an ASCII tag: compare it while the field still holds
     * the on-disk byte order.  Comparing after le32_to_cpus() would never
     * match on a big-endian host. */
    signature_ok = !memcmp(&s->rt.signature, "regi", 4);

    le32_to_cpus(&s->rt.signature);
    le32_to_cpus(&s->rt.checksum);
    le32_to_cpus(&s->rt.entry_count);
    le32_to_cpus(&s->rt.reserved);
    offset += sizeof(s->rt);

    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
        !signature_ok) {
        ret = -EINVAL;
        goto fail;
    }

    /* Per spec, maximum region table entry count is 2047 */
    if (s->rt.entry_count > 2047) {
        ret = -EINVAL;
        goto fail;
    }

    for (i = 0; i < s->rt.entry_count; i++) {
        memcpy(&rt_entry, buffer + offset, sizeof(rt_entry));
        offset += sizeof(rt_entry);

        leguid_to_cpus(&rt_entry.guid);
        le64_to_cpus(&rt_entry.file_offset);
        le32_to_cpus(&rt_entry.length);
        le32_to_cpus(&rt_entry.data_bits);

        /* check for region overlap between these entries, and any
         * other memory regions in the file */
        ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length);
        if (ret < 0) {
            goto fail;
        }

        vhdx_region_register(s, rt_entry.file_offset, rt_entry.length);

        /* see if we recognize the entry */
        if (guid_eq(rt_entry.guid, bat_guid)) {
            /* must be unique; if we have already found it this is invalid */
            if (bat_rt_found) {
                ret = -EINVAL;
                goto fail;
            }
            bat_rt_found = true;
            s->bat_rt = rt_entry;
            continue;
        }

        if (guid_eq(rt_entry.guid, metadata_guid)) {
            /* must be unique; if we have already found it this is invalid */
            if (metadata_rt_found) {
                ret = -EINVAL;
                goto fail;
            }
            metadata_rt_found = true;
            s->metadata_rt = rt_entry;
            continue;
        }

        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
            /* cannot read vhdx file - required region table entry that
             * we do not understand.  per spec, we must fail to open */
            ret = -ENOTSUP;
            goto fail;
        }
    }

    /* both the BAT and the metadata region are mandatory */
    if (!bat_rt_found || !metadata_rt_found) {
        ret = -EINVAL;
        goto fail;
    }

    ret = 0;

fail:
    /* reached on success as well; only the scratch buffer is released */
    qemu_vfree(buffer);
    return ret;
}
553

    
554

    
555

    
556
/* Metadata initial parser
557
 *
558
 * This loads all the metadata entry fields.  This may cause additional
559
 * fields to be processed (e.g. parent locator, etc..).
560
 *
561
 * There are 5 Metadata items that are always required:
562
 *      - File Parameters (block size, has a parent)
563
 *      - Virtual Disk Size (size, in bytes, of the virtual drive)
564
 *      - Page 83 Data (scsi page 83 guid)
565
 *      - Logical Sector Size (logical sector size in bytes, either 512 or
566
 *                             4096.  We only support 512 currently)
567
 *      - Physical Sector Size (512 or 4096)
568
 *
569
 * Also, if the File Parameters indicate this is a differencing file,
570
 * we must also look for the Parent Locator metadata item.
571
 */
572
static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
573
{
574
    int ret = 0;
575
    uint8_t *buffer;
576
    int offset = 0;
577
    uint32_t i = 0;
578
    VHDXMetadataTableEntry md_entry;
579

    
580
    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
581

    
582
    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
583
                     VHDX_METADATA_TABLE_MAX_SIZE);
584
    if (ret < 0) {
585
        goto exit;
586
    }
587
    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
588
    offset += sizeof(s->metadata_hdr);
589

    
590
    le64_to_cpus(&s->metadata_hdr.signature);
591
    le16_to_cpus(&s->metadata_hdr.reserved);
592
    le16_to_cpus(&s->metadata_hdr.entry_count);
593

    
594
    if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
595
        ret = -EINVAL;
596
        goto exit;
597
    }
598

    
599
    s->metadata_entries.present = 0;
600

    
601
    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
602
        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
603
        ret = -EINVAL;
604
        goto exit;
605
    }
606

    
607
    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
608
        memcpy(&md_entry, buffer + offset, sizeof(md_entry));
609
        offset += sizeof(md_entry);
610

    
611
        leguid_to_cpus(&md_entry.item_id);
612
        le32_to_cpus(&md_entry.offset);
613
        le32_to_cpus(&md_entry.length);
614
        le32_to_cpus(&md_entry.data_bits);
615
        le32_to_cpus(&md_entry.reserved2);
616

    
617
        if (guid_eq(md_entry.item_id, file_param_guid)) {
618
            if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) {
619
                ret = -EINVAL;
620
                goto exit;
621
            }
622
            s->metadata_entries.file_parameters_entry = md_entry;
623
            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
624
            continue;
625
        }
626

    
627
        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
628
            if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) {
629
                ret = -EINVAL;
630
                goto exit;
631
            }
632
            s->metadata_entries.virtual_disk_size_entry = md_entry;
633
            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
634
            continue;
635
        }
636

    
637
        if (guid_eq(md_entry.item_id, page83_guid)) {
638
            if (s->metadata_entries.present & META_PAGE_83_PRESENT) {
639
                ret = -EINVAL;
640
                goto exit;
641
            }
642
            s->metadata_entries.page83_data_entry = md_entry;
643
            s->metadata_entries.present |= META_PAGE_83_PRESENT;
644
            continue;
645
        }
646

    
647
        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
648
            if (s->metadata_entries.present &
649
                META_LOGICAL_SECTOR_SIZE_PRESENT) {
650
                ret = -EINVAL;
651
                goto exit;
652
            }
653
            s->metadata_entries.logical_sector_size_entry = md_entry;
654
            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
655
            continue;
656
        }
657

    
658
        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
659
            if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) {
660
                ret = -EINVAL;
661
                goto exit;
662
            }
663
            s->metadata_entries.phys_sector_size_entry = md_entry;
664
            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
665
            continue;
666
        }
667

    
668
        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
669
            if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
670
                ret = -EINVAL;
671
                goto exit;
672
            }
673
            s->metadata_entries.parent_locator_entry = md_entry;
674
            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
675
            continue;
676
        }
677

    
678
        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
679
            /* cannot read vhdx file - required region table entry that
680
             * we do not understand.  per spec, we must fail to open */
681
            ret = -ENOTSUP;
682
            goto exit;
683
        }
684
    }
685

    
686
    if (s->metadata_entries.present != META_ALL_PRESENT) {
687
        ret = -ENOTSUP;
688
        goto exit;
689
    }
690

    
691
    ret = bdrv_pread(bs->file,
692
                     s->metadata_entries.file_parameters_entry.offset
693
                                         + s->metadata_rt.file_offset,
694
                     &s->params,
695
                     sizeof(s->params));
696

    
697
    if (ret < 0) {
698
        goto exit;
699
    }
700

    
701
    le32_to_cpus(&s->params.block_size);
702
    le32_to_cpus(&s->params.data_bits);
703

    
704

    
705
    /* We now have the file parameters, so we can tell if this is a
706
     * differencing file (i.e.. has_parent), is dynamic or fixed
707
     * sized (leave_blocks_allocated), and the block size */
708

    
709
    /* The parent locator required iff the file parameters has_parent set */
710
    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
711
        if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
712
            /* TODO: parse  parent locator fields */
713
            ret = -ENOTSUP; /* temp, until differencing files are supported */
714
            goto exit;
715
        } else {
716
            /* if has_parent is set, but there is not parent locator present,
717
             * then that is an invalid combination */
718
            ret = -EINVAL;
719
            goto exit;
720
        }
721
    }
722

    
723
    /* determine virtual disk size, logical sector size,
724
     * and phys sector size */
725

    
726
    ret = bdrv_pread(bs->file,
727
                     s->metadata_entries.virtual_disk_size_entry.offset
728
                                           + s->metadata_rt.file_offset,
729
                     &s->virtual_disk_size,
730
                     sizeof(uint64_t));
731
    if (ret < 0) {
732
        goto exit;
733
    }
734
    ret = bdrv_pread(bs->file,
735
                     s->metadata_entries.logical_sector_size_entry.offset
736
                                             + s->metadata_rt.file_offset,
737
                     &s->logical_sector_size,
738
                     sizeof(uint32_t));
739
    if (ret < 0) {
740
        goto exit;
741
    }
742
    ret = bdrv_pread(bs->file,
743
                     s->metadata_entries.phys_sector_size_entry.offset
744
                                          + s->metadata_rt.file_offset,
745
                     &s->physical_sector_size,
746
                     sizeof(uint32_t));
747
    if (ret < 0) {
748
        goto exit;
749
    }
750

    
751
    le64_to_cpus(&s->virtual_disk_size);
752
    le32_to_cpus(&s->logical_sector_size);
753
    le32_to_cpus(&s->physical_sector_size);
754

    
755
    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
756
        ret = -EINVAL;
757
        goto exit;
758
    }
759

    
760
    /* both block_size and sector_size are guaranteed powers of 2 */
761
    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
762
    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
763
                     (uint64_t)s->logical_sector_size /
764
                     (uint64_t)s->params.block_size;
765

    
766
    /* These values are ones we will want to use for division / multiplication
767
     * later on, and they are all guaranteed (per the spec) to be powers of 2,
768
     * so we can take advantage of that for shift operations during
769
     * reads/writes */
770
    if (s->logical_sector_size & (s->logical_sector_size - 1)) {
771
        ret = -EINVAL;
772
        goto exit;
773
    }
774
    if (s->sectors_per_block & (s->sectors_per_block - 1)) {
775
        ret = -EINVAL;
776
        goto exit;
777
    }
778
    if (s->chunk_ratio & (s->chunk_ratio - 1)) {
779
        ret = -EINVAL;
780
        goto exit;
781
    }
782
    s->block_size = s->params.block_size;
783
    if (s->block_size & (s->block_size - 1)) {
784
        ret = -EINVAL;
785
        goto exit;
786
    }
787

    
788
    s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size);
789
    s->sectors_per_block_bits =   31 - clz32(s->sectors_per_block);
790
    s->chunk_ratio_bits =         63 - clz64(s->chunk_ratio);
791
    s->block_size_bits =          31 - clz32(s->block_size);
792

    
793
    ret = 0;
794

    
795
exit:
796
    qemu_vfree(buffer);
797
    return ret;
798
}
799

    
800

    
801
/* Releases everything held in the driver state of bs: both header copies,
 * the BAT, the parent entries, the migration blocker, the log header and
 * the list of registered regions.  Freed pointers are reset to NULL. */
static void vhdx_close(BlockDriverState *bs)
{
    BDRVVHDXState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < 2; idx++) {
        qemu_vfree(s->headers[idx]);
        s->headers[idx] = NULL;
    }

    qemu_vfree(s->bat);
    s->bat = NULL;

    qemu_vfree(s->parent_entries);
    s->parent_entries = NULL;

    /* drop the blocker from the migration code before freeing it */
    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);

    qemu_vfree(s->log.hdr);
    s->log.hdr = NULL;

    vhdx_region_unregister_all(s);
}
818

    
819
static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
820
                     Error **errp)
821
{
822
    BDRVVHDXState *s = bs->opaque;
823
    int ret = 0;
824
    uint32_t i;
825
    uint64_t signature;
826
    uint32_t data_blocks_cnt, bitmap_blocks_cnt;
827
    bool log_flushed = false;
828

    
829

    
830
    s->bat = NULL;
831
    s->first_visible_write = true;
832

    
833
    qemu_co_mutex_init(&s->lock);
834
    QLIST_INIT(&s->regions);
835

    
836
    /* validate the file signature */
837
    ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t));
838
    if (ret < 0) {
839
        goto fail;
840
    }
841
    if (memcmp(&signature, "vhdxfile", 8)) {
842
        ret = -EINVAL;
843
        goto fail;
844
    }
845

    
846
    /* This is used for any header updates, for the file_write_guid.
847
     * The spec dictates that a new value should be used for the first
848
     * header update */
849
    vhdx_guid_generate(&s->session_guid);
850

    
851
    ret = vhdx_parse_header(bs, s);
852
    if (ret < 0) {
853
        goto fail;
854
    }
855

    
856
    ret = vhdx_parse_log(bs, s, &log_flushed);
857
    if (ret < 0) {
858
        goto fail;
859
    }
860

    
861
    ret = vhdx_open_region_tables(bs, s);
862
    if (ret < 0) {
863
        goto fail;
864
    }
865

    
866
    ret = vhdx_parse_metadata(bs, s);
867
    if (ret < 0) {
868
        goto fail;
869
    }
870

    
871
    s->block_size = s->params.block_size;
872

    
873
    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
874
     * logical_sector_size */
875
    bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits;
876

    
877
    data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits;
878
    if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) {
879
        data_blocks_cnt++;
880
    }
881
    bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits;
882
    if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) {
883
        bitmap_blocks_cnt++;
884
    }
885

    
886
    if (s->parent_entries) {
887
        s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
888
    } else {
889
        s->bat_entries = data_blocks_cnt +
890
                         ((data_blocks_cnt - 1) >> s->chunk_ratio_bits);
891
    }
892

    
893
    s->bat_offset = s->bat_rt.file_offset;
894

    
895
    if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) {
896
        /* BAT allocation is not large enough for all entries */
897
        ret = -EINVAL;
898
        goto fail;
899
    }
900

    
901
    /* s->bat is freed in vhdx_close() */
902
    s->bat = qemu_blockalign(bs, s->bat_rt.length);
903

    
904
    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
905
    if (ret < 0) {
906
        goto fail;
907
    }
908

    
909
    uint64_t payblocks = s->chunk_ratio;
910
    /* endian convert, and verify populated BAT field file offsets against
911
     * region table and log entries */
912
    for (i = 0; i < s->bat_entries; i++) {
913
        le64_to_cpus(&s->bat[i]);
914
        if (payblocks--) {
915
            /* payload bat entries */
916
            if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) ==
917
                    PAYLOAD_BLOCK_FULLY_PRESENT) {
918
                ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK,
919
                                        s->block_size);
920
                if (ret < 0) {
921
                    goto fail;
922
                }
923
            }
924
        } else {
925
            payblocks = s->chunk_ratio;
926
            /* Once differencing files are supported, verify sector bitmap
927
             * blocks here */
928
        }
929
    }
930

    
931
    if (flags & BDRV_O_RDWR) {
932
        ret = vhdx_update_headers(bs, s, false, NULL);
933
        if (ret < 0) {
934
            goto fail;
935
        }
936
    }
937

    
938
    /* TODO: differencing files */
939

    
940
    /* Disable migration when VHDX images are used */
941
    error_set(&s->migration_blocker,
942
            QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
943
            "vhdx", bs->device_name, "live migration");
944
    migrate_add_blocker(s->migration_blocker);
945

    
946
    return 0;
947
fail:
948
    vhdx_close(bs);
949
    return ret;
950
}
951

    
952
static int vhdx_reopen_prepare(BDRVReopenState *state,
953
                               BlockReopenQueue *queue, Error **errp)
954
{
955
    return 0;
956
}
957

    
958

    
959
/*
960
 * Perform sector to block offset translations, to get various
961
 * sector and file offsets into the image.  See VHDXSectorInfo
962
 */
963
static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num,
964
                                 int nb_sectors, VHDXSectorInfo *sinfo)
965
{
966
    uint32_t block_offset;
967

    
968
    sinfo->bat_idx = sector_num >> s->sectors_per_block_bits;
969
    /* effectively a modulo - this gives us the offset into the block
970
     * (in sector sizes) for our sector number */
971
    block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits);
972
    /* the chunk ratio gives us the interleaving of the sector
973
     * bitmaps, so we need to advance our page block index by the
974
     * sector bitmaps entry number */
975
    sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits;
976

    
977
    /* the number of sectors we can read/write in this cycle */
978
    sinfo->sectors_avail = s->sectors_per_block - block_offset;
979

    
980
    sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits;
981

    
982
    if (sinfo->sectors_avail > nb_sectors) {
983
        sinfo->sectors_avail = nb_sectors;
984
    }
985

    
986
    sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits;
987

    
988
    sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS;
989

    
990
    sinfo->block_offset = block_offset << s->logical_sector_size_bits;
991

    
992
    /* The file offset must be past the header section, so must be > 0 */
993
    if (sinfo->file_offset == 0) {
994
        return;
995
    }
996

    
997
    /* block offset is the offset in vhdx logical sectors, in
998
     * the payload data block. Convert that to a byte offset
999
     * in the block, and add in the payload data block offset
1000
     * in the file, in bytes, to get the final read address */
1001

    
1002
    sinfo->file_offset <<= 20;  /* now in bytes, rather than 1MB units */
1003
    sinfo->file_offset += sinfo->block_offset;
1004
}
1005

    
1006

    
1007

    
1008
/*
 * Coroutine read handler.
 *
 * The request is processed one payload block at a time.  Blocks whose BAT
 * state is not-present, undefined, unmapped or zero read back as zeroes;
 * fully-present blocks are read from the underlying file.  Images with a
 * parent (differencing files) are rejected with -ENOTSUP, and
 * partially-present or unknown block states yield -EIO.
 *
 * s->lock is held while the BAT is consulted and released around the read
 * from bs->file.
 *
 * Returns 0 on success, a negative errno-style value on failure.
 */
static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *qiov)
{
    BDRVVHDXState *s = bs->opaque;
    VHDXSectorInfo sinfo;
    QEMUIOVector hd_qiov;
    uint64_t bytes_done = 0;
    int ret = 0;

    qemu_iovec_init(&hd_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    while (nb_sectors > 0) {
        /* A differencing file would require inspecting the sector bitmap to
         * see whether we have the data or not; not supported yet */
        if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
            ret = -ENOTSUP;
            goto exit;
        }

        vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);

        /* window of the caller's vector covered by this cycle */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, sinfo.bytes_avail);

        /* dispatch on the payload block state */
        switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) {
        case PAYLOAD_BLOCK_FULLY_PRESENT:
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_readv(bs->file,
                                sinfo.file_offset >> BDRV_SECTOR_BITS,
                                sinfo.sectors_avail, &hd_qiov);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto exit;
            }
            break;

        case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
        case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
        case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
        case PAYLOAD_BLOCK_ZERO:
            /* nothing stored for this block: hand back zeroes */
            qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
            break;

        case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
            /* difference files are not supported yet, fall through
             * to error */
        default:
            ret = -EIO;
            goto exit;
        }

        bytes_done += sinfo.bytes_avail;
        sector_num += sinfo.sectors_avail;
        nb_sectors -= sinfo.sectors_avail;
    }

    ret = 0;

exit:
    qemu_co_mutex_unlock(&s->lock);
    qemu_iovec_destroy(&hd_qiov);
    return ret;
}
1072

    
1073
/*
1074
 * Allocate a new payload block at the end of the file.
1075
 *
1076
 * Allocation will happen at 1MB alignment inside the file
1077
 *
1078
 * Returns the file offset start of the new payload block
1079
 */
1080
static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
1081
                                    uint64_t *new_offset)
1082
{
1083
    *new_offset = bdrv_getlength(bs->file);
1084

    
1085
    /* per the spec, the address for a block is in units of 1MB */
1086
    *new_offset = ROUND_UP(*new_offset, 1024 * 1024);
1087

    
1088
    return bdrv_truncate(bs->file, *new_offset + s->block_size);
1089
}
1090

    
1091
/*
1092
 * Update the BAT table entry with the new file offset, and the new entry
1093
 * state */
1094
static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s,
1095
                                       VHDXSectorInfo *sinfo,
1096
                                       uint64_t *bat_entry_le,
1097
                                       uint64_t *bat_offset, int state)
1098
{
1099
    /* The BAT entry is a uint64, with 44 bits for the file offset in units of
1100
     * 1MB, and 3 bits for the block state. */
1101
    s->bat[sinfo->bat_idx]  = ((sinfo->file_offset>>20) <<
1102
                               VHDX_BAT_FILE_OFF_BITS);
1103

    
1104
    s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK;
1105

    
1106
    *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]);
1107
    *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry);
1108

    
1109
}
1110

    
1111
/* Per the spec, on the first write of guest-visible data to the file the
1112
 * data write guid must be updated in the header */
1113
int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s)
1114
{
1115
    int ret = 0;
1116
    if (s->first_visible_write) {
1117
        s->first_visible_write = false;
1118
        ret = vhdx_update_headers(bs, s, true, NULL);
1119
    }
1120
    return ret;
1121
}
1122

    
1123
/*
 * Coroutine write handler.
 *
 * The request is processed one payload block at a time.  A block that is
 * not yet backed by file space (zero, not-present, unmapped or undefined
 * state) gets a new payload block allocated at the end of the file, its BAT
 * entry is switched to fully-present, and after the data has been written
 * the BAT change is pushed through the log.  For a block in the zero state
 * on a file that does not zero-fill on extension, the data is padded with
 * explicit zero buffers so that the whole block is written.
 *
 * Images with a parent (differencing files) are rejected with -ENOTSUP;
 * partially-present or unknown block states yield -EIO.
 *
 * s->lock is held except around the write to bs->file.  If that write
 * fails after a BAT update, the in-memory BAT entry is restored to its
 * prior offset and state.
 *
 * Returns 0 or a positive value on success, a negative errno-style value
 * on failure.
 */
static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *qiov)
{
    int ret = -ENOTSUP;
    BDRVVHDXState *s = bs->opaque;
    VHDXSectorInfo sinfo;
    uint64_t bytes_done = 0;
    uint64_t bat_entry = 0;
    uint64_t bat_entry_offset = 0;
    QEMUIOVector hd_qiov;
    struct iovec iov1 = { 0 };
    struct iovec iov2 = { 0 };
    int sectors_to_write;
    int bat_state;
    uint64_t bat_prior_offset = 0;
    bool bat_update = false;

    qemu_iovec_init(&hd_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    ret = vhdx_user_visible_write(bs, s);
    if (ret < 0) {
        goto exit;
    }

    while (nb_sectors > 0) {
        bool use_zero_buffers = false;
        bat_update = false;
        if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
            /* not supported yet */
            ret = -ENOTSUP;
            goto exit;
        } else {
            vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
            sectors_to_write = sinfo.sectors_avail;

            qemu_iovec_reset(&hd_qiov);
            /* check the payload block state */
            bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK;
            switch (bat_state) {
            case PAYLOAD_BLOCK_ZERO:
                /* in this case, we need to preserve zero writes for
                 * data that is not part of this write, so we must pad
                 * the rest of the buffer to zeroes */

                /* if we are on a posix system with ftruncate() that extends
                 * a file, then it is zero-filled for us.  On Win32, the raw
                 * layer uses SetFilePointer and SetFileEnd, which does not
                 * zero fill AFAIK */

                /* Queue another write of zero buffers if the underlying file
                 * does not zero-fill on file extension */

                if (bdrv_has_zero_init(bs->file) == 0) {
                    use_zero_buffers = true;

                    /* zero fill the front, if any */
                    if (sinfo.block_offset) {
                        /* release the buffer of an earlier iteration; its
                         * write has completed and hd_qiov was reset */
                        qemu_vfree(iov1.iov_base);
                        iov1.iov_len = sinfo.block_offset;
                        iov1.iov_base = qemu_blockalign(bs, iov1.iov_len);
                        memset(iov1.iov_base, 0, iov1.iov_len);
                        qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0,
                                              sinfo.block_offset);
                        sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS;
                    }

                    /* our actual data */
                    qemu_iovec_concat(&hd_qiov, qiov,  bytes_done,
                                      sinfo.bytes_avail);

                    /* zero fill the back, if any: the data ends at
                     * block_offset + bytes_avail, and whatever remains up to
                     * block_size must be padded.  The sum (not the
                     * difference) is what has to be compared, otherwise the
                     * unsigned subtraction wraps whenever the offset exceeds
                     * the data length and the padding is skipped. */
                    if ((sinfo.bytes_avail + sinfo.block_offset) <
                         s->block_size) {
                        /* release the buffer of an earlier iteration */
                        qemu_vfree(iov2.iov_base);
                        iov2.iov_len = s->block_size -
                                      (sinfo.bytes_avail + sinfo.block_offset);
                        iov2.iov_base = qemu_blockalign(bs, iov2.iov_len);
                        memset(iov2.iov_base, 0, iov2.iov_len);
                        /* append the whole tail buffer, so that the vector
                         * size matches sectors_to_write */
                        qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0,
                                              iov2.iov_len);
                        sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS;
                    }
                }

                /* fall through */
            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
                /* remembered so the BAT entry can be restored on error */
                bat_prior_offset = sinfo.file_offset;
                ret = vhdx_allocate_block(bs, s, &sinfo.file_offset);
                if (ret < 0) {
                    goto exit;
                }
                /* once we support differencing files, this may also be
                 * partially present */
                /* update block state to the newly specified state */
                vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
                                            &bat_entry_offset,
                                            PAYLOAD_BLOCK_FULLY_PRESENT);
                bat_update = true;
                /* since we just allocated a block, file_offset is the
                 * beginning of the payload block. It needs to be the
                 * write address, which includes the offset into the block.
                 * With zero buffers the write covers the whole block and
                 * therefore starts at its beginning. */
                if (!use_zero_buffers) {
                    sinfo.file_offset += sinfo.block_offset;
                }
                /* fall through */
            case PAYLOAD_BLOCK_FULLY_PRESENT:
                /* if the file offset address is in the header zone,
                 * there is a problem */
                if (sinfo.file_offset < (1024 * 1024)) {
                    ret = -EFAULT;
                    goto error_bat_restore;
                }

                if (!use_zero_buffers) {
                    qemu_iovec_concat(&hd_qiov, qiov,  bytes_done,
                                      sinfo.bytes_avail);
                }
                /* block exists, so we can just overwrite it */
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_writev(bs->file,
                                    sinfo.file_offset >> BDRV_SECTOR_BITS,
                                    sectors_to_write, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto error_bat_restore;
                }
                break;
            case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
                /* we don't yet support difference files, fall through
                 * to error */
            default:
                ret = -EIO;
                goto exit;
                break;
            }

            if (bat_update) {
                /* this will update the BAT entry into the log journal, and
                 * then flush the log journal out to disk */
                ret =  vhdx_log_write_and_flush(bs, s, &bat_entry,
                                                sizeof(VHDXBatEntry),
                                                bat_entry_offset);
                if (ret < 0) {
                    goto exit;
                }
            }

            nb_sectors -= sinfo.sectors_avail;
            sector_num += sinfo.sectors_avail;
            bytes_done += sinfo.bytes_avail;

        }
    }

    goto exit;

error_bat_restore:
    if (bat_update) {
        /* keep metadata in sync, and restore the bat entry state
         * if error. */
        sinfo.file_offset = bat_prior_offset;
        vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
                                    &bat_entry_offset, bat_state);
    }
exit:
    qemu_vfree(iov1.iov_base);
    qemu_vfree(iov2.iov_base);
    qemu_co_mutex_unlock(&s->lock);
    qemu_iovec_destroy(&hd_qiov);
    return ret;
}
1296

    
1297

    
1298
static BlockDriver bdrv_vhdx = {
1299
    .format_name            = "vhdx",
1300
    .instance_size          = sizeof(BDRVVHDXState),
1301
    .bdrv_probe             = vhdx_probe,
1302
    .bdrv_open              = vhdx_open,
1303
    .bdrv_close             = vhdx_close,
1304
    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
1305
    .bdrv_co_readv          = vhdx_co_readv,
1306
    .bdrv_co_writev         = vhdx_co_writev,
1307
};
1308

    
1309
static void bdrv_vhdx_init(void)
1310
{
1311
    bdrv_register(&bdrv_vhdx);
1312
}
1313

    
1314
block_init(bdrv_vhdx_init);