Statistics
| Branch: | Revision:

root / block / vmdk.c @ d1833ef5

History | View | Annotate | Download (63.3 kB)

1
/*
2
 * Block driver for the VMDK format
3
 *
4
 * Copyright (c) 2004 Fabrice Bellard
5
 * Copyright (c) 2005 Filip Navara
6
 *
7
 * Permission is hereby granted, free of charge, to any person obtaining a copy
8
 * of this software and associated documentation files (the "Software"), to deal
9
 * in the Software without restriction, including without limitation the rights
10
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the Software is
12
 * furnished to do so, subject to the following conditions:
13
 *
14
 * The above copyright notice and this permission notice shall be included in
15
 * all copies or substantial portions of the Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
 * THE SOFTWARE.
24
 */
25

    
26
#include "qemu-common.h"
27
#include "block/block_int.h"
28
#include "qemu/module.h"
29
#include "migration/migration.h"
30
#include <zlib.h>
31

    
32
/* Magic values compared against the first four bytes (read big-endian) of a
 * sparse extent: "COWD" for VMDK3 and "KDMV" for VMDK4. */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

/* Value of VMDK4Header.compressAlgorithm that marks a deflate-compressed
 * extent */
#define VMDK4_COMPRESSION_DEFLATE 1

/* Bits of VMDK4Header.flags */
#define VMDK4_FLAG_NL_DETECT    (1 << 0)
/* A backup (redundant) grain directory is present at rgd_offset */
#define VMDK4_FLAG_RGD          (1 << 1)
/* Zeroed-grain enable bit */
#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
#define VMDK4_FLAG_COMPRESS     (1 << 16)
/* The extent contains markers */
#define VMDK4_FLAG_MARKER       (1 << 17)

/* gd_offset value telling the reader to take the header from the footer at
 * the end of the file instead */
#define VMDK4_GD_AT_END 0xffffffffffffffffULL

/* NOTE(review): presumably the grain table entry value of a zeroed grain;
 * its use is not visible in this part of the file */
#define VMDK_GTE_ZEROED 0x1

/* VMDK internal error codes */
#define VMDK_OK      0
#define VMDK_ERROR   (-1)
/* Cluster not allocated */
#define VMDK_UNALLOC (-2)
#define VMDK_ZEROED  (-3)

/* Name of the "zeroed_grain" block option */
#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
53

    
54
/*
 * On-disk header of a VMDK3 ("COWD") sparse extent, as read from the file
 * right after the 4-byte magic. Fields are converted with le32_to_cpu()
 * before use.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* extent size in sectors */
    uint32_t granularity;       /* sectors per cluster */
    uint32_t l1dir_offset;      /* L1 table position, in sectors */
    uint32_t l1dir_size;        /* number of L1 entries */
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} QEMU_PACKED VMDK3Header;
66

    
67
/*
 * On-disk header of a VMDK4 ("KDMV") sparse extent, as read from the file
 * right after the 4-byte magic. The same layout is reused for the footer.
 * Fields are converted with le*_to_cpu() before use.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;             /* VMDK4_FLAG_* bits */
    uint64_t capacity;          /* extent size in sectors */
    uint64_t granularity;       /* sectors per cluster (grain) */
    uint64_t desc_offset;       /* embedded descriptor position, in sectors */
    uint64_t desc_size;
    /* Number of GrainTableEntries per GrainTable */
    uint32_t num_gtes_per_gt;
    uint64_t rgd_offset;        /* backup grain directory, in sectors */
    uint64_t gd_offset;         /* grain directory, in sectors, or
                                 * VMDK4_GD_AT_END */
    uint64_t grain_offset;      /* in sectors; the file must be at least
                                 * this long */
    char filler[1];
    char check_bytes[4];
    uint16_t compressAlgorithm; /* VMDK4_COMPRESSION_DEFLATE if compressed */
} QEMU_PACKED VMDK4Header;
83

    
84
#define L2_CACHE_SIZE 16
85

    
86
typedef struct VmdkExtent {
87
    BlockDriverState *file;
88
    bool flat;
89
    bool compressed;
90
    bool has_marker;
91
    bool has_zero_grain;
92
    int version;
93
    int64_t sectors;
94
    int64_t end_sector;
95
    int64_t flat_start_offset;
96
    int64_t l1_table_offset;
97
    int64_t l1_backup_table_offset;
98
    uint32_t *l1_table;
99
    uint32_t *l1_backup_table;
100
    unsigned int l1_size;
101
    uint32_t l1_entry_sectors;
102

    
103
    unsigned int l2_size;
104
    uint32_t *l2_cache;
105
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
106
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
107

    
108
    int64_t cluster_sectors;
109
    char *type;
110
} VmdkExtent;
111

    
112
typedef struct BDRVVmdkState {
113
    CoMutex lock;
114
    uint64_t desc_offset;
115
    bool cid_updated;
116
    bool cid_checked;
117
    uint32_t cid;
118
    uint32_t parent_cid;
119
    int num_extents;
120
    /* Extent array with num_extents entries, ascend ordered by address */
121
    VmdkExtent *extents;
122
    Error *migration_blocker;
123
    char *create_type;
124
} BDRVVmdkState;
125

    
126
/*
 * Bookkeeping for a pending L2 table update.
 * NOTE(review): most users of this struct lie outside the visible part of
 * the file; field notes below are limited to what vmdk_L2update() shows.
 */
typedef struct VmdkMetaData {
    uint32_t offset;            /* value stored into the L2 entry */
    unsigned int l1_index;
    unsigned int l2_index;      /* index of the entry inside its L2 table */
    unsigned int l2_offset;     /* position of the L2 table, in sectors */
    int valid;
    uint32_t *l2_cache_entry;
} VmdkMetaData;
134

    
135
/*
 * Header that precedes grain data in extents using markers; the payload
 * follows immediately in data[].
 *
 * data is a C99 flexible array member rather than a GNU zero-length array;
 * sizeof(VmdkGrainMarker) is unchanged.
 */
typedef struct VmdkGrainMarker {
    uint64_t lba;
    uint32_t size;
    uint8_t  data[];
} QEMU_PACKED VmdkGrainMarker;
140

    
141
/* Marker type values; the footer and end-of-stream markers are checked when
 * the footer of an image is read. */
enum {
    MARKER_END_OF_STREAM    = 0,
    MARKER_GRAIN_TABLE      = 1,
    MARKER_GRAIN_DIRECTORY  = 2,
    MARKER_FOOTER           = 3,
};
147

    
148
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
149
{
150
    uint32_t magic;
151

    
152
    if (buf_size < 4) {
153
        return 0;
154
    }
155
    magic = be32_to_cpu(*(uint32_t *)buf);
156
    if (magic == VMDK3_MAGIC ||
157
        magic == VMDK4_MAGIC) {
158
        return 100;
159
    } else {
160
        const char *p = (const char *)buf;
161
        const char *end = p + buf_size;
162
        while (p < end) {
163
            if (*p == '#') {
164
                /* skip comment line */
165
                while (p < end && *p != '\n') {
166
                    p++;
167
                }
168
                p++;
169
                continue;
170
            }
171
            if (*p == ' ') {
172
                while (p < end && *p == ' ') {
173
                    p++;
174
                }
175
                /* skip '\r' if windows line endings used. */
176
                if (p < end && *p == '\r') {
177
                    p++;
178
                }
179
                /* only accept blank lines before 'version=' line */
180
                if (p == end || *p != '\n') {
181
                    return 0;
182
                }
183
                p++;
184
                continue;
185
            }
186
            if (end - p >= strlen("version=X\n")) {
187
                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
188
                    strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
189
                    return 100;
190
                }
191
            }
192
            if (end - p >= strlen("version=X\r\n")) {
193
                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
194
                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
195
                    return 100;
196
                }
197
            }
198
            return 0;
199
        }
200
        return 0;
201
    }
202
}
203

    
204
/* Bytes per sector */
#define SECTOR_SIZE 512
/* Bytes transferred when the descriptor is read or rewritten:
 * 20 sectors of 512 bytes each */
#define DESC_SIZE (20 * SECTOR_SIZE)
#define BUF_SIZE 4096
/* first sector of 512 bytes */
#define HEADER_SIZE 512
208

    
209
/*
 * Release every extent of the image: its tables, L2 cache and type string,
 * and the reference on its file unless that file is bs->file itself. The
 * extent array is freed last.
 */
static void vmdk_free_extents(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < s->num_extents; idx++) {
        VmdkExtent *cur = &s->extents[idx];

        g_free(cur->l1_table);
        g_free(cur->l2_cache);
        g_free(cur->l1_backup_table);
        g_free(cur->type);
        /* bs->file is not ours to unref */
        if (cur->file != bs->file) {
            bdrv_unref(cur->file);
        }
    }
    g_free(s->extents);
}
227

    
228
/*
 * Drop the most recently added extent by shrinking the extent array by one
 * entry. Nothing the extent points to is freed here. Does nothing if the
 * image has no extents.
 */
static void vmdk_free_last_extent(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;

    if (s->num_extents != 0) {
        s->num_extents--;
        s->extents = g_realloc(s->extents,
                               s->num_extents * sizeof(VmdkExtent));
    }
}
238

    
239
/*
 * Read a content ID from the image descriptor.
 *
 * @parent: non-zero to look up "parentCID", zero to look up "CID"
 *
 * Returns the hexadecimal value found after the key, 0xffffffff if the key
 * is absent or unparsable, or 0 if the descriptor cannot be read.
 */
static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
{
    BDRVVmdkState *s = bs->opaque;
    char desc[DESC_SIZE];
    const char *key = parent ? "parentCID" : "CID";
    /* sizeof() counts the terminating NUL, which stands in for the '=' */
    size_t skip = parent ? sizeof("parentCID") : sizeof("CID");
    const char *found;
    uint32_t cid = 0xffffffff;

    if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) < 0) {
        return 0;
    }

    desc[DESC_SIZE - 1] = '\0';
    found = strstr(desc, key);
    if (found != NULL) {
        sscanf(found + skip, "%x", &cid);
    }

    return cid;
}
270

    
271
/*
 * Rewrite the "CID" value in the image descriptor.
 *
 * The descriptor text from "parentCID" onwards is saved, the new CID is
 * printed over the old one, and the saved text is appended again before the
 * descriptor is written back synchronously.
 *
 * Returns 0 on success, -EINVAL if the descriptor has no "parentCID", or
 * the negative error code of the failed read/write.
 */
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    BDRVVmdkState *s = bs->opaque;
    char desc[DESC_SIZE], tail[DESC_SIZE];
    char *parent_pos, *cid_pos;
    int ret;

    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        return ret;
    }
    desc[DESC_SIZE - 1] = '\0';

    parent_pos = strstr(desc, "parentCID");
    if (!parent_pos) {
        return -EINVAL;
    }
    /* keep everything from "parentCID" on; snprintf() below cuts it off */
    pstrcpy(tail, sizeof(tail), parent_pos);

    cid_pos = strstr(desc, "CID");
    if (cid_pos) {
        /* sizeof() counts the NUL, i.e. skips "CID" and the '=' */
        cid_pos += sizeof("CID");
        snprintf(cid_pos, sizeof(desc) - (cid_pos - desc), "%x\n", cid);
        pstrcat(desc, sizeof(desc), tail);
    }

    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
    return ret < 0 ? ret : 0;
}
304

    
305
/*
 * Check that the parentCID recorded in this image still matches the CID of
 * its backing image. The comparison is done only until it has succeeded
 * once, and is skipped when there is no backing image.
 *
 * Returns 1 if the CID is valid, 0 if it is not.
 */
static int vmdk_is_cid_valid(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    BlockDriverState *parent = bs->backing_hd;

    if (parent && !s->cid_checked) {
        if (vmdk_read_cid(parent, 0) != s->parent_cid) {
            /* CID not valid */
            return 0;
        }
    }
    s->cid_checked = true;
    /* CID valid */
    return 1;
}
322

    
323
/* Queue extents, if any, for reopen() */
324
static int vmdk_reopen_prepare(BDRVReopenState *state,
325
                               BlockReopenQueue *queue, Error **errp)
326
{
327
    BDRVVmdkState *s;
328
    int ret = -1;
329
    int i;
330
    VmdkExtent *e;
331

    
332
    assert(state != NULL);
333
    assert(state->bs != NULL);
334

    
335
    if (queue == NULL) {
336
        error_setg(errp, "No reopen queue for VMDK extents");
337
        goto exit;
338
    }
339

    
340
    s = state->bs->opaque;
341

    
342
    assert(s != NULL);
343

    
344
    for (i = 0; i < s->num_extents; i++) {
345
        e = &s->extents[i];
346
        if (e->file != state->bs->file) {
347
            bdrv_reopen_queue(queue, e->file, state->flags);
348
        }
349
    }
350
    ret = 0;
351

    
352
exit:
353
    return ret;
354
}
355

    
356
/*
 * Look for parentFileNameHint="..." in the image descriptor and, if found,
 * copy the quoted file name into bs->backing_file.
 *
 * Returns 0 on success (also when no hint is present), -EINVAL if the hint
 * is truncated, has no closing quote or does not fit into bs->backing_file,
 * or the negative error code of the failed descriptor read.
 */
static int vmdk_parent_open(BlockDriverState *bs)
{
    char *p_name;
    char desc[DESC_SIZE + 1];
    BDRVVmdkState *s = bs->opaque;
    int ret;

    desc[DESC_SIZE] = '\0';
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        return ret;
    }

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
        char *end_name;
        /* sizeof() counts the NUL, so this covers the key, '=' and '"' */
        const size_t skip = sizeof("parentFileNameHint") + 1;

        /* Do not step over the string terminator (or out of desc) when the
         * descriptor ends right after the key. */
        if (strlen(p_name) < skip) {
            return -EINVAL;
        }
        p_name += skip;
        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
            return -EINVAL;
        }
        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
            return -EINVAL;
        }

        /* the size argument includes room for the NUL, so exactly the
         * characters before the closing quote are copied */
        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

    return 0;
}
387

    
388
/* Create and append extent to the extent array. Return the added VmdkExtent
389
 * address. return NULL if allocation failed. */
390
static int vmdk_add_extent(BlockDriverState *bs,
391
                           BlockDriverState *file, bool flat, int64_t sectors,
392
                           int64_t l1_offset, int64_t l1_backup_offset,
393
                           uint32_t l1_size,
394
                           int l2_size, uint64_t cluster_sectors,
395
                           VmdkExtent **new_extent,
396
                           Error **errp)
397
{
398
    VmdkExtent *extent;
399
    BDRVVmdkState *s = bs->opaque;
400

    
401
    if (cluster_sectors > 0x200000) {
402
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
403
        error_setg(errp, "Invalid granularity, image may be corrupt");
404
        return -EFBIG;
405
    }
406
    if (l1_size > 512 * 1024 * 1024) {
407
        /* Although with big capacity and small l1_entry_sectors, we can get a
408
         * big l1_size, we don't want unbounded value to allocate the table.
409
         * Limit it to 512M, which is 16PB for default cluster and L2 table
410
         * size */
411
        error_setg(errp, "L1 size too big");
412
        return -EFBIG;
413
    }
414

    
415
    s->extents = g_realloc(s->extents,
416
                              (s->num_extents + 1) * sizeof(VmdkExtent));
417
    extent = &s->extents[s->num_extents];
418
    s->num_extents++;
419

    
420
    memset(extent, 0, sizeof(VmdkExtent));
421
    extent->file = file;
422
    extent->flat = flat;
423
    extent->sectors = sectors;
424
    extent->l1_table_offset = l1_offset;
425
    extent->l1_backup_table_offset = l1_backup_offset;
426
    extent->l1_size = l1_size;
427
    extent->l1_entry_sectors = l2_size * cluster_sectors;
428
    extent->l2_size = l2_size;
429
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
430

    
431
    if (s->num_extents > 1) {
432
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
433
    } else {
434
        extent->end_sector = extent->sectors;
435
    }
436
    bs->total_sectors = extent->end_sector;
437
    if (new_extent) {
438
        *new_extent = extent;
439
    }
440
    return 0;
441
}
442

    
443
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
444
                            Error **errp)
445
{
446
    int ret;
447
    int l1_size, i;
448

    
449
    /* read the L1 table */
450
    l1_size = extent->l1_size * sizeof(uint32_t);
451
    extent->l1_table = g_malloc(l1_size);
452
    ret = bdrv_pread(extent->file,
453
                     extent->l1_table_offset,
454
                     extent->l1_table,
455
                     l1_size);
456
    if (ret < 0) {
457
        error_setg_errno(errp, -ret,
458
                         "Could not read l1 table from extent '%s'",
459
                         extent->file->filename);
460
        goto fail_l1;
461
    }
462
    for (i = 0; i < extent->l1_size; i++) {
463
        le32_to_cpus(&extent->l1_table[i]);
464
    }
465

    
466
    if (extent->l1_backup_table_offset) {
467
        extent->l1_backup_table = g_malloc(l1_size);
468
        ret = bdrv_pread(extent->file,
469
                         extent->l1_backup_table_offset,
470
                         extent->l1_backup_table,
471
                         l1_size);
472
        if (ret < 0) {
473
            error_setg_errno(errp, -ret,
474
                             "Could not read l1 backup table from extent '%s'",
475
                             extent->file->filename);
476
            goto fail_l1b;
477
        }
478
        for (i = 0; i < extent->l1_size; i++) {
479
            le32_to_cpus(&extent->l1_backup_table[i]);
480
        }
481
    }
482

    
483
    extent->l2_cache =
484
        g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
485
    return 0;
486
 fail_l1b:
487
    g_free(extent->l1_backup_table);
488
 fail_l1:
489
    g_free(extent->l1_table);
490
    return ret;
491
}
492

    
493
/*
 * Open a VMDK3 ("COWD") sparse extent: read its header, append a matching
 * extent to the image and load the extent's tables.
 *
 * Returns 0 on success or a negative errno value; errp is set by the step
 * that failed. If loading the tables fails, the extent added here is
 * removed from the extent array again.
 */
static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
                                 BlockDriverState *file,
                                 int flags, Error **errp)
{
    int ret;
    uint32_t magic;
    VMDK3Header header;
    VmdkExtent *extent;

    /* the header follows the 4-byte magic */
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
                         file->filename);
        return ret;
    }
    ret = vmdk_add_extent(bs, file, false,
                          le32_to_cpu(header.disk_sectors),
                          /* widen before shifting so that the byte offset
                           * is not truncated to 32 bits */
                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
                          0,
                          le32_to_cpu(header.l1dir_size),
                          4096,
                          le32_to_cpu(header.granularity),
                          &extent,
                          errp);
    if (ret < 0) {
        return ret;
    }
    ret = vmdk_init_tables(bs, extent, errp);
    if (ret) {
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
    }
    return ret;
}
528

    
529
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
530
                               Error **errp);
531

    
532
static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
533
                            Error **errp)
534
{
535
    int64_t size;
536
    char *buf;
537
    int ret;
538

    
539
    size = bdrv_getlength(file);
540
    if (size < 0) {
541
        error_setg_errno(errp, -size, "Could not access file");
542
        return NULL;
543
    }
544

    
545
    size = MIN(size, 1 << 20);  /* avoid unbounded allocation */
546
    buf = g_malloc0(size + 1);
547

    
548
    ret = bdrv_pread(file, desc_offset, buf, size);
549
    if (ret < 0) {
550
        error_setg_errno(errp, -ret, "Could not read from file");
551
        g_free(buf);
552
        return NULL;
553
    }
554

    
555
    return buf;
556
}
557

    
558
/*
 * Open a VMDK4 ("KDMV") sparse extent.
 *
 * The header is read from @file. A header with zero capacity and a non-zero
 * descriptor offset only carries an embedded descriptor, which is then
 * opened like a descriptor file. If gd_offset is VMDK4_GD_AT_END, the
 * header is replaced by the copy in the footer at the end of the file.
 * After validation an extent is appended to the image and its tables are
 * loaded.
 *
 * Returns 0 on success or a negative errno value. If loading the tables
 * fails, the extent added here is removed from the extent array again.
 */
static int vmdk_open_vmdk4(BlockDriverState *bs,
                           BlockDriverState *file,
                           int flags, Error **errp)
{
    int ret;
    uint32_t magic;
    uint32_t l1_size, l1_entry_sectors;
    VMDK4Header header;
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
    int64_t l1_backup_offset = 0;

    /* the header follows the 4-byte magic */
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
                         file->filename);
        /* header is uninitialized; do not go on parsing it */
        return ret;
    }
    if (header.capacity == 0) {
        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
        if (desc_offset) {
            char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
            if (!buf) {
                return -EINVAL;
            }
            ret = vmdk_open_desc_file(bs, flags, buf, errp);
            g_free(buf);
            return ret;
        }
    }

    if (!s->create_type) {
        s->create_type = g_strdup("monolithicSparse");
    }

    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
        /*
         * The footer takes precedence over the header, so read it in. The
         * footer starts at offset -1024 from the end: One sector for the
         * footer, and another one for the end-of-stream marker.
         * The read starts one sector earlier to include the footer marker.
         */
        struct {
            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED footer_marker;

            uint32_t magic;
            VMDK4Header header;
            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];

            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED eos_marker;
        } QEMU_PACKED footer;

        ret = bdrv_pread(file,
            bs->file->total_sectors * 512 - 1536,
            &footer, sizeof(footer));
        if (ret < 0) {
            return ret;
        }

        /* Some sanity checks for the footer */
        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
            le32_to_cpu(footer.footer_marker.size) != 0  ||
            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
            le64_to_cpu(footer.eos_marker.val) != 0  ||
            le32_to_cpu(footer.eos_marker.size) != 0  ||
            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
        {
            return -EINVAL;
        }

        header = footer.header;
    }

    if (le32_to_cpu(header.version) > 3) {
        char buf[64];
        snprintf(buf, sizeof(buf), "VMDK version %d",
                 le32_to_cpu(header.version));
        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                bs->device_name, "vmdk", buf);
        return -ENOTSUP;
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
        /* VMware KB 2064959 explains that version 3 added support for
         * persistent changed block tracking (CBT), and backup software can
         * read it as version=1 if it doesn't care about the changed area
         * information. So we are safe to enable read only. */
        error_setg(errp, "VMDK version 3 must be read only");
        return -EINVAL;
    }

    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
        error_report("L2 table size too big");
        return -EINVAL;
    }

    /* sectors covered by one L1 entry; zero would divide by zero below */
    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
                        * le64_to_cpu(header.granularity);
    if (l1_entry_sectors == 0) {
        return -EINVAL;
    }
    /* number of L1 entries needed to cover the capacity, rounded up */
    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                / l1_entry_sectors;
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
    if (bdrv_getlength(file) <
            le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) {
        error_report("File truncated, expecting at least %lld bytes",
                le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
        return -EINVAL;
    }

    ret = vmdk_add_extent(bs, file, false,
                          le64_to_cpu(header.capacity),
                          le64_to_cpu(header.gd_offset) << 9,
                          l1_backup_offset,
                          l1_size,
                          le32_to_cpu(header.num_gtes_per_gt),
                          le64_to_cpu(header.granularity),
                          &extent,
                          errp);
    if (ret < 0) {
        return ret;
    }
    extent->compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
    if (extent->compressed) {
        g_free(s->create_type);
        s->create_type = g_strdup("streamOptimized");
    }
    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
    extent->version = le32_to_cpu(header.version);
    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
    ret = vmdk_init_tables(bs, extent, errp);
    if (ret) {
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
    }
    return ret;
}
706

    
707
/* find an option value out of descriptor file */
708
static int vmdk_parse_description(const char *desc, const char *opt_name,
709
        char *buf, int buf_size)
710
{
711
    char *opt_pos, *opt_end;
712
    const char *end = desc + strlen(desc);
713

    
714
    opt_pos = strstr(desc, opt_name);
715
    if (!opt_pos) {
716
        return VMDK_ERROR;
717
    }
718
    /* Skip "=\"" following opt_name */
719
    opt_pos += strlen(opt_name) + 2;
720
    if (opt_pos >= end) {
721
        return VMDK_ERROR;
722
    }
723
    opt_end = opt_pos;
724
    while (opt_end < end && *opt_end != '"') {
725
        opt_end++;
726
    }
727
    if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
728
        return VMDK_ERROR;
729
    }
730
    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
731
    return VMDK_OK;
732
}
733

    
734
/* Open an extent file and append to bs array */
735
static int vmdk_open_sparse(BlockDriverState *bs,
736
                            BlockDriverState *file, int flags,
737
                            char *buf, Error **errp)
738
{
739
    uint32_t magic;
740

    
741
    magic = ldl_be_p(buf);
742
    switch (magic) {
743
        case VMDK3_MAGIC:
744
            return vmdk_open_vmfs_sparse(bs, file, flags, errp);
745
            break;
746
        case VMDK4_MAGIC:
747
            return vmdk_open_vmdk4(bs, file, flags, errp);
748
            break;
749
        default:
750
            return -EMEDIUMTYPE;
751
            break;
752
    }
753
}
754

    
755
/*
 * Walk the descriptor line by line and open every extent it lists.
 *
 * @desc:           NUL-terminated descriptor text
 * @desc_file_path: path of the descriptor; extent file names are combined
 *                  with it
 *
 * Lines that do not look like an "RW" extent line with a supported type
 * and a positive size are skipped. Each accepted extent file is opened and
 * appended to the image's extent array.
 *
 * Returns 0 on success or a negative errno value with errp set. On failure
 * the extent file opened for the failing line is released again; extents
 * added by earlier lines are left for the caller to free.
 */
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
                              const char *desc_file_path, Error **errp)
{
    int ret;
    char access[11];
    char type[11];
    char fname[512];
    const char *p = desc;
    int64_t sectors = 0;
    int64_t flat_offset;
    char extent_path[PATH_MAX];
    BlockDriverState *extent_file;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent;

    while (*p) {
        /* parse extent line:
         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
         * or
         * RW [size in sectors] SPARSE "file-name.vmdk"
         */
        flat_offset = -1;
        ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
                access, &sectors, type, fname, &flat_offset);
        if (ret < 4 || strcmp(access, "RW")) {
            goto next_line;
        } else if (!strcmp(type, "FLAT")) {
            if (ret != 5 || flat_offset < 0) {
                error_setg(errp, "Invalid extent lines: \n%s", p);
                return -EINVAL;
            }
        } else if (!strcmp(type, "VMFS")) {
            if (ret == 4) {
                flat_offset = 0;
            } else {
                error_setg(errp, "Invalid extent lines:\n%s", p);
                return -EINVAL;
            }
        } else if (ret != 4) {
            error_setg(errp, "Invalid extent lines:\n%s", p);
            return -EINVAL;
        }

        if (sectors <= 0 ||
            (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) ||
            (strcmp(access, "RW"))) {
            goto next_line;
        }

        path_combine(extent_path, sizeof(extent_path),
                desc_file_path, fname);
        extent_file = NULL;
        ret = bdrv_open(&extent_file, extent_path, NULL, NULL,
                        bs->open_flags | BDRV_O_PROTOCOL, NULL, errp);
        if (ret) {
            return ret;
        }

        /* save to extents array */
        if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
            /* FLAT extent */

            ret = vmdk_add_extent(bs, extent_file, true, sectors,
                            0, 0, 0, 0, 0, &extent, errp);
            if (ret < 0) {
                /* the file was not stored in any extent, drop our ref */
                bdrv_unref(extent_file);
                return ret;
            }
            extent->flat_start_offset = flat_offset << 9;
        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
            /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
            char *buf = vmdk_read_desc(extent_file, 0, errp);
            if (!buf) {
                ret = -EINVAL;
            } else {
                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
            }
            /* buf is only needed for opening; free it on success as well */
            g_free(buf);
            if (ret) {
                bdrv_unref(extent_file);
                return ret;
            }
            extent = &s->extents[s->num_extents - 1];
        } else {
            error_setg(errp, "Unsupported extent type '%s'", type);
            bdrv_unref(extent_file);
            return -ENOTSUP;
        }
        extent->type = g_strdup(type);
next_line:
        /* move to next line */
        while (*p) {
            if (*p == '\n') {
                p++;
                break;
            }
            p++;
        }
    }
    return 0;
}
855

    
856
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
857
                               Error **errp)
858
{
859
    int ret;
860
    char ct[128];
861
    BDRVVmdkState *s = bs->opaque;
862

    
863
    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
864
        ret = -EMEDIUMTYPE;
865
        goto exit;
866
    }
867
    if (strcmp(ct, "monolithicFlat") &&
868
        strcmp(ct, "vmfs") &&
869
        strcmp(ct, "vmfsSparse") &&
870
        strcmp(ct, "twoGbMaxExtentSparse") &&
871
        strcmp(ct, "twoGbMaxExtentFlat")) {
872
        error_setg(errp, "Unsupported image type '%s'", ct);
873
        ret = -ENOTSUP;
874
        goto exit;
875
    }
876
    s->create_type = g_strdup(ct);
877
    s->desc_offset = 0;
878
    ret = vmdk_parse_extents(buf, bs, bs->file->filename, errp);
879
exit:
880
    return ret;
881
}
882

    
883
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
884
                     Error **errp)
885
{
886
    char *buf = NULL;
887
    int ret;
888
    BDRVVmdkState *s = bs->opaque;
889

    
890
    buf = vmdk_read_desc(bs->file, 0, errp);
891
    if (!buf) {
892
        return -EINVAL;
893
    }
894

    
895
    if (vmdk_open_sparse(bs, bs->file, flags, buf, errp) == 0) {
896
        s->desc_offset = 0x200;
897
    } else {
898
        ret = vmdk_open_desc_file(bs, flags, buf, errp);
899
        if (ret) {
900
            goto fail;
901
        }
902
    }
903
    /* try to open parent images, if exist */
904
    ret = vmdk_parent_open(bs);
905
    if (ret) {
906
        goto fail;
907
    }
908
    s->cid = vmdk_read_cid(bs, 0);
909
    s->parent_cid = vmdk_read_cid(bs, 1);
910
    qemu_co_mutex_init(&s->lock);
911

    
912
    /* Disable migration when VMDK images are used */
913
    error_set(&s->migration_blocker,
914
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
915
              "vmdk", bs->device_name, "live migration");
916
    migrate_add_blocker(s->migration_blocker);
917
    g_free(buf);
918
    return 0;
919

    
920
fail:
921
    g_free(buf);
922
    g_free(s->create_type);
923
    s->create_type = NULL;
924
    vmdk_free_extents(bs);
925
    return ret;
926
}
927

    
928

    
929
/*
 * Raise bs->bl.write_zeroes_alignment to the largest cluster size (in
 * sectors) among the non-flat extents. Always returns 0.
 */
static int vmdk_refresh_limits(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < s->num_extents; idx++) {
        const VmdkExtent *cur = &s->extents[idx];

        if (cur->flat) {
            continue;
        }
        bs->bl.write_zeroes_alignment =
            MAX(bs->bl.write_zeroes_alignment, cur->cluster_sectors);
    }

    return 0;
}
944

    
945
static int get_whole_cluster(BlockDriverState *bs,
946
                VmdkExtent *extent,
947
                uint64_t cluster_offset,
948
                uint64_t offset,
949
                bool allocate)
950
{
951
    int ret = VMDK_OK;
952
    uint8_t *whole_grain = NULL;
953

    
954
    /* we will be here if it's first write on non-exist grain(cluster).
955
     * try to read from parent image, if exist */
956
    if (bs->backing_hd) {
957
        whole_grain =
958
            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
959
        if (!vmdk_is_cid_valid(bs)) {
960
            ret = VMDK_ERROR;
961
            goto exit;
962
        }
963

    
964
        /* floor offset to cluster */
965
        offset -= offset % (extent->cluster_sectors * 512);
966
        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
967
                extent->cluster_sectors);
968
        if (ret < 0) {
969
            ret = VMDK_ERROR;
970
            goto exit;
971
        }
972

    
973
        /* Write grain only into the active image */
974
        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
975
                extent->cluster_sectors);
976
        if (ret < 0) {
977
            ret = VMDK_ERROR;
978
            goto exit;
979
        }
980
    }
981
exit:
982
    qemu_vfree(whole_grain);
983
    return ret;
984
}
985

    
986
/*
 * Write the grain table entry described by @m_data to disk.
 *
 * m_data->offset is stored (little endian) at index m_data->l2_index of the
 * L2 table located at sector m_data->l2_offset.  When the extent has a
 * backup L1 table, m_data->l2_offset is replaced by the backup table's entry
 * for m_data->l1_index and the same value is written there as well.  Finally
 * the cached copy of the entry is refreshed if m_data points at one.
 *
 * Returns VMDK_OK, or VMDK_ERROR if one of the synchronous writes fails.
 */
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
{
    uint32_t le_entry;
    /* byte position of the entry inside its L2 table */
    const size_t pos_in_table = m_data->l2_index * sizeof(m_data->offset);
    QEMU_BUILD_BUG_ON(sizeof(le_entry) != sizeof(m_data->offset));

    le_entry = cpu_to_le32(m_data->offset);

    /* primary L2 table */
    if (bdrv_pwrite_sync(extent->file,
                         ((int64_t)m_data->l2_offset * 512) + pos_in_table,
                         &le_entry, sizeof(le_entry)) < 0) {
        return VMDK_ERROR;
    }

    /* backup L2 table */
    if (extent->l1_backup_table_offset != 0) {
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
        if (bdrv_pwrite_sync(extent->file,
                             ((int64_t)m_data->l2_offset * 512) + pos_in_table,
                             &le_entry, sizeof(le_entry)) < 0) {
            return VMDK_ERROR;
        }
    }

    if (m_data->l2_cache_entry) {
        *m_data->l2_cache_entry = le_entry;
    }

    return VMDK_OK;
}
1016

    
1017
static int get_cluster_offset(BlockDriverState *bs,
1018
                                    VmdkExtent *extent,
1019
                                    VmdkMetaData *m_data,
1020
                                    uint64_t offset,
1021
                                    int allocate,
1022
                                    uint64_t *cluster_offset)
1023
{
1024
    unsigned int l1_index, l2_offset, l2_index;
1025
    int min_index, i, j;
1026
    uint32_t min_count, *l2_table;
1027
    bool zeroed = false;
1028

    
1029
    if (m_data) {
1030
        m_data->valid = 0;
1031
    }
1032
    if (extent->flat) {
1033
        *cluster_offset = extent->flat_start_offset;
1034
        return VMDK_OK;
1035
    }
1036

    
1037
    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
1038
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
1039
    if (l1_index >= extent->l1_size) {
1040
        return VMDK_ERROR;
1041
    }
1042
    l2_offset = extent->l1_table[l1_index];
1043
    if (!l2_offset) {
1044
        return VMDK_UNALLOC;
1045
    }
1046
    for (i = 0; i < L2_CACHE_SIZE; i++) {
1047
        if (l2_offset == extent->l2_cache_offsets[i]) {
1048
            /* increment the hit count */
1049
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
1050
                for (j = 0; j < L2_CACHE_SIZE; j++) {
1051
                    extent->l2_cache_counts[j] >>= 1;
1052
                }
1053
            }
1054
            l2_table = extent->l2_cache + (i * extent->l2_size);
1055
            goto found;
1056
        }
1057
    }
1058
    /* not found: load a new entry in the least used one */
1059
    min_index = 0;
1060
    min_count = 0xffffffff;
1061
    for (i = 0; i < L2_CACHE_SIZE; i++) {
1062
        if (extent->l2_cache_counts[i] < min_count) {
1063
            min_count = extent->l2_cache_counts[i];
1064
            min_index = i;
1065
        }
1066
    }
1067
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
1068
    if (bdrv_pread(
1069
                extent->file,
1070
                (int64_t)l2_offset * 512,
1071
                l2_table,
1072
                extent->l2_size * sizeof(uint32_t)
1073
            ) != extent->l2_size * sizeof(uint32_t)) {
1074
        return VMDK_ERROR;
1075
    }
1076

    
1077
    extent->l2_cache_offsets[min_index] = l2_offset;
1078
    extent->l2_cache_counts[min_index] = 1;
1079
 found:
1080
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
1081
    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
1082

    
1083
    if (m_data) {
1084
        m_data->valid = 1;
1085
        m_data->l1_index = l1_index;
1086
        m_data->l2_index = l2_index;
1087
        m_data->offset = *cluster_offset;
1088
        m_data->l2_offset = l2_offset;
1089
        m_data->l2_cache_entry = &l2_table[l2_index];
1090
    }
1091
    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
1092
        zeroed = true;
1093
    }
1094

    
1095
    if (!*cluster_offset || zeroed) {
1096
        if (!allocate) {
1097
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
1098
        }
1099

    
1100
        /* Avoid the L2 tables update for the images that have snapshots. */
1101
        *cluster_offset = bdrv_getlength(extent->file);
1102
        if (!extent->compressed) {
1103
            bdrv_truncate(
1104
                extent->file,
1105
                *cluster_offset + (extent->cluster_sectors << 9)
1106
            );
1107
        }
1108

    
1109
        *cluster_offset >>= 9;
1110
        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
1111

    
1112
        /* First of all we write grain itself, to avoid race condition
1113
         * that may to corrupt the image.
1114
         * This problem may occur because of insufficient space on host disk
1115
         * or inappropriate VM shutdown.
1116
         */
1117
        if (get_whole_cluster(
1118
                bs, extent, *cluster_offset, offset, allocate) == -1) {
1119
            return VMDK_ERROR;
1120
        }
1121

    
1122
        if (m_data) {
1123
            m_data->offset = *cluster_offset;
1124
        }
1125
    }
1126
    *cluster_offset <<= 9;
1127
    return VMDK_OK;
1128
}
1129

    
1130
/*
 * Return the first extent, searching forward from @start_hint (or from the
 * first extent when @start_hint is NULL), whose end_sector lies beyond
 * @sector_num.  Returns NULL when no such extent exists.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *const past_last = &s->extents[s->num_extents];
    VmdkExtent *cur;

    for (cur = start_hint ? start_hint : &s->extents[0];
         cur < past_last; cur++) {
        if (sector_num < cur->end_sector) {
            return cur;
        }
    }
    return NULL;
}
1146

    
1147
/*
 * Report the allocation status of the cluster containing @sector_num.
 *
 * The return value is -EIO on lookup error, 0 for an unallocated cluster,
 * BDRV_BLOCK_ZERO for a zeroed grain, and BDRV_BLOCK_DATA for allocated
 * data; for data stored in bs->file itself the host offset is added together
 * with BDRV_BLOCK_OFFSET_VALID.  When no extent covers @sector_num, 0 is
 * returned and *pnum is left untouched.
 *
 * *pnum receives the number of sectors, starting at @sector_num and capped
 * by @nb_sectors, that remain in the same cluster.
 */
static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
{
    BDRVVmdkState *s = bs->opaque;
    int64_t index_in_cluster, n, ret;
    int64_t extent_begin_sector;
    uint64_t offset;
    VmdkExtent *extent;

    extent = find_extent(s, sector_num, NULL);
    if (!extent) {
        return 0;
    }
    qemu_co_mutex_lock(&s->lock);
    ret = get_cluster_offset(bs, extent, NULL,
                            sector_num * 512, 0, &offset);
    qemu_co_mutex_unlock(&s->lock);

    switch (ret) {
    case VMDK_ERROR:
        ret = -EIO;
        break;
    case VMDK_UNALLOC:
        ret = 0;
        break;
    case VMDK_ZEROED:
        ret = BDRV_BLOCK_ZERO;
        break;
    case VMDK_OK:
        ret = BDRV_BLOCK_DATA;
        if (extent->file == bs->file) {
            ret |= BDRV_BLOCK_OFFSET_VALID | offset;
        }

        break;
    }

    /* Clusters are laid out relative to the first sector of the extent, so
     * the position inside the cluster must be derived from the
     * extent-relative sector number, not from the absolute one. */
    extent_begin_sector = extent->end_sector - extent->sectors;
    index_in_cluster = (sector_num - extent_begin_sector)
                       % extent->cluster_sectors;
    n = extent->cluster_sectors - index_in_cluster;
    if (n > nb_sectors) {
        n = nb_sectors;
    }
    *pnum = n;
    return ret;
}
1191

    
1192
/*
 * Write @nb_sectors sectors from @buf into a grain of @extent.
 *
 * @cluster_offset:    byte offset of the grain inside extent->file
 * @offset_in_cluster: byte offset inside the grain
 * @sector_num:        guest sector number, recorded in the grain marker of
 *                     compressed extents
 *
 * For compressed extents the data is deflated and written behind a grain
 * marker; such extents must have markers enabled, otherwise -EINVAL is
 * returned.  Returns 0 on success, -EINVAL if compression fails, and the
 * write error (or -EIO for a short write) otherwise.
 */
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, const uint8_t *buf,
                            int nb_sectors, int64_t sector_num)
{
    int ret;
    VmdkGrainMarker *data = NULL;
    uLongf buf_len;
    const uint8_t *write_buf = buf;
    int write_len = nb_sectors * 512;

    if (extent->compressed) {
        if (!extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }
        /* room for twice the cluster size of compressed output */
        buf_len = (extent->cluster_sectors << 9) * 2;
        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
        if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
                buf_len == 0) {
            ret = -EINVAL;
            goto out;
        }
        /* Marker fields are little endian on disk (the read path decodes
         * the size with le32_to_cpu), so convert from host byte order. */
        data->lba = cpu_to_le64(sector_num);
        data->size = cpu_to_le32(buf_len);
        write_buf = (uint8_t *)data;
        write_len = buf_len + sizeof(VmdkGrainMarker);
    }
    ret = bdrv_pwrite(extent->file,
                        cluster_offset + offset_in_cluster,
                        write_buf,
                        write_len);
    if (ret != write_len) {
        ret = ret < 0 ? ret : -EIO;
        goto out;
    }
    ret = 0;
 out:
    g_free(data);
    return ret;
}
1232

    
1233
/*
 * Read @nb_sectors sectors of a grain of @extent into @buf.
 *
 * @cluster_offset:    byte offset of the grain inside extent->file
 * @offset_in_cluster: byte offset inside the (uncompressed) grain
 *
 * Uncompressed extents are read directly; any failure is reported as -EIO.
 * For compressed extents two clusters' worth of bytes are read, the payload
 * (located behind the grain marker when the extent uses markers) is
 * inflated, and the requested range is copied out.  Returns 0 on success,
 * the read error if the raw read fails, and -EINVAL for an implausible
 * payload length, a decompression failure, or a request outside the
 * inflated data.
 */
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, uint8_t *buf,
                            int nb_sectors)
{
    int ret;
    int cluster_bytes, buf_bytes;
    uint8_t *cluster_buf, *compressed_data;
    uint8_t *uncomp_buf;
    uint32_t data_len;
    uint32_t avail_len;
    VmdkGrainMarker *marker;
    uLongf buf_len;


    if (!extent->compressed) {
        ret = bdrv_pread(extent->file,
                          cluster_offset + offset_in_cluster,
                          buf, nb_sectors * 512);
        if (ret == nb_sectors * 512) {
            return 0;
        } else {
            return -EIO;
        }
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
    ret = bdrv_pread(extent->file,
                cluster_offset,
                cluster_buf, buf_bytes);
    if (ret < 0) {
        goto out;
    }
    compressed_data = cluster_buf;
    buf_len = cluster_bytes;
    data_len = cluster_bytes;
    if (extent->has_marker) {
        marker = (VmdkGrainMarker *)cluster_buf;
        compressed_data = marker->data;
        data_len = le32_to_cpu(marker->size);
    }
    /* The payload starts behind the marker header (if any), so only the
     * remainder of cluster_buf may be handed to uncompress(); a length taken
     * from the image must not reach past the end of the buffer. */
    avail_len = buf_bytes - (compressed_data - cluster_buf);
    if (!data_len || data_len > avail_len) {
        ret = -EINVAL;
        goto out;
    }
    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
    if (ret != Z_OK) {
        ret = -EINVAL;
        goto out;

    }
    if (offset_in_cluster < 0 ||
            offset_in_cluster + nb_sectors * 512 > buf_len) {
        ret = -EINVAL;
        goto out;
    }
    memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
    ret = 0;

 out:
    g_free(uncomp_buf);
    g_free(cluster_buf);
    return ret;
}
1298

    
1299
/*
 * Read @nb_sectors sectors starting at @sector_num into @buf.
 *
 * The request is processed cluster by cluster.  Allocated clusters are read
 * from their extent.  Unallocated clusters are read from the backing image
 * when there is one (after validating the CID) and are zero filled
 * otherwise; zeroed grains are always zero filled.
 *
 * Returns 0 on success, -EIO when no extent covers a sector or the cluster
 * lookup fails, -EINVAL when the CID check fails, or the error of the
 * underlying read.
 */
static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
                    uint8_t *buf, int nb_sectors)
{
    BDRVVmdkState *s = bs->opaque;
    int ret;
    uint64_t n, index_in_cluster;
    uint64_t extent_begin_sector, extent_relative_sector_num;
    VmdkExtent *extent = NULL;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        extent = find_extent(s, sector_num, extent);
        if (!extent) {
            return -EIO;
        }
        ret = get_cluster_offset(
                            bs, extent, NULL,
                            sector_num << 9, 0, &cluster_offset);
        if (ret == VMDK_ERROR) {
            /* A failed lookup is not the same as an unallocated cluster:
             * report it instead of handing out zeros or backing data. */
            return -EIO;
        }
        extent_begin_sector = extent->end_sector - extent->sectors;
        extent_relative_sector_num = sector_num - extent_begin_sector;
        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
        /* never cross a cluster boundary in one step */
        n = extent->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }
        if (ret != VMDK_OK) {
            /* if not allocated, try to read from parent image, if exist */
            if (bs->backing_hd && ret != VMDK_ZEROED) {
                if (!vmdk_is_cid_valid(bs)) {
                    return -EINVAL;
                }
                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
                if (ret < 0) {
                    return ret;
                }
            } else {
                memset(buf, 0, 512 * n);
            }
        } else {
            ret = vmdk_read_extent(extent,
                            cluster_offset, index_in_cluster * 512,
                            buf, n);
            if (ret) {
                return ret;
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
1351

    
1352
/*
 * Coroutine read entry point: runs vmdk_read() while holding the state
 * lock and returns its result.
 */
static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
                                     uint8_t *buf, int nb_sectors)
{
    BDRVVmdkState *state = bs->opaque;
    int status;

    qemu_co_mutex_lock(&state->lock);
    status = vmdk_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&state->lock);

    return status;
}
1362

    
1363
/**
1364
 * vmdk_write:
1365
 * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
1366
 *                if possible, otherwise return -ENOTSUP.
1367
 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
1368
 *                with each cluster. By dry run we can find if the zero write
1369
 *                is possible without modifying image data.
1370
 *
1371
 * Returns: error code with 0 for success.
1372
 */
1373
static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
1374
                      const uint8_t *buf, int nb_sectors,
1375
                      bool zeroed, bool zero_dry_run)
1376
{
1377
    BDRVVmdkState *s = bs->opaque;
1378
    VmdkExtent *extent = NULL;
1379
    int ret;
1380
    int64_t index_in_cluster, n;
1381
    uint64_t extent_begin_sector, extent_relative_sector_num;
1382
    uint64_t cluster_offset;
1383
    VmdkMetaData m_data;
1384

    
1385
    if (sector_num > bs->total_sectors) {
1386
        error_report("Wrong offset: sector_num=0x%" PRIx64
1387
                " total_sectors=0x%" PRIx64 "\n",
1388
                sector_num, bs->total_sectors);
1389
        return -EIO;
1390
    }
1391

    
1392
    while (nb_sectors > 0) {
1393
        extent = find_extent(s, sector_num, extent);
1394
        if (!extent) {
1395
            return -EIO;
1396
        }
1397
        ret = get_cluster_offset(
1398
                                bs,
1399
                                extent,
1400
                                &m_data,
1401
                                sector_num << 9, !extent->compressed,
1402
                                &cluster_offset);
1403
        if (extent->compressed) {
1404
            if (ret == VMDK_OK) {
1405
                /* Refuse write to allocated cluster for streamOptimized */
1406
                error_report("Could not write to allocated cluster"
1407
                              " for streamOptimized");
1408
                return -EIO;
1409
            } else {
1410
                /* allocate */
1411
                ret = get_cluster_offset(
1412
                                        bs,
1413
                                        extent,
1414
                                        &m_data,
1415
                                        sector_num << 9, 1,
1416
                                        &cluster_offset);
1417
            }
1418
        }
1419
        if (ret == VMDK_ERROR) {
1420
            return -EINVAL;
1421
        }
1422
        extent_begin_sector = extent->end_sector - extent->sectors;
1423
        extent_relative_sector_num = sector_num - extent_begin_sector;
1424
        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
1425
        n = extent->cluster_sectors - index_in_cluster;
1426
        if (n > nb_sectors) {
1427
            n = nb_sectors;
1428
        }
1429
        if (zeroed) {
1430
            /* Do zeroed write, buf is ignored */
1431
            if (extent->has_zero_grain &&
1432
                    index_in_cluster == 0 &&
1433
                    n >= extent->cluster_sectors) {
1434
                n = extent->cluster_sectors;
1435
                if (!zero_dry_run) {
1436
                    m_data.offset = VMDK_GTE_ZEROED;
1437
                    /* update L2 tables */
1438
                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
1439
                        return -EIO;
1440
                    }
1441
                }
1442
            } else {
1443
                return -ENOTSUP;
1444
            }
1445
        } else {
1446
            ret = vmdk_write_extent(extent,
1447
                            cluster_offset, index_in_cluster * 512,
1448
                            buf, n, sector_num);
1449
            if (ret) {
1450
                return ret;
1451
            }
1452
            if (m_data.valid) {
1453
                /* update L2 tables */
1454
                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
1455
                    return -EIO;
1456
                }
1457
            }
1458
        }
1459
        nb_sectors -= n;
1460
        sector_num += n;
1461
        buf += n * 512;
1462

    
1463
        /* update CID on the first write every time the virtual disk is
1464
         * opened */
1465
        if (!s->cid_updated) {
1466
            ret = vmdk_write_cid(bs, time(NULL));
1467
            if (ret < 0) {
1468
                return ret;
1469
            }
1470
            s->cid_updated = true;
1471
        }
1472
    }
1473
    return 0;
1474
}
1475

    
1476
/*
 * Coroutine write entry point: performs an ordinary (non-zeroed) write via
 * vmdk_write() while holding the state lock and returns its result.
 */
static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
                                      const uint8_t *buf, int nb_sectors)
{
    BDRVVmdkState *state = bs->opaque;
    int status;

    qemu_co_mutex_lock(&state->lock);
    status = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
    qemu_co_mutex_unlock(&state->lock);

    return status;
}
1486

    
1487
/*
 * Coroutine write-zeroes entry point.
 *
 * Under the state lock, vmdk_write() is first invoked as a dry run; only
 * when that pass reports success is it invoked again to really update the
 * image.  Returns the result of the last vmdk_write() call made.
 */
static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
                                             int64_t sector_num,
                                             int nb_sectors,
                                             BdrvRequestFlags flags)
{
    BDRVVmdkState *state = bs->opaque;
    int status;

    qemu_co_mutex_lock(&state->lock);

    /* A zero write can fail when the range is not cluster aligned, so probe
     * with a dry run before touching the image. */
    status = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
    if (status == 0) {
        status = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
    }

    qemu_co_mutex_unlock(&state->lock);
    return status;
}
1504

    
1505
static int vmdk_create_extent(const char *filename, int64_t filesize,
1506
                              bool flat, bool compress, bool zeroed_grain,
1507
                              Error **errp)
1508
{
1509
    int ret, i;
1510
    BlockDriverState *bs = NULL;
1511
    VMDK4Header header;
1512
    Error *local_err;
1513
    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
1514
    uint32_t *gd_buf = NULL;
1515
    int gd_buf_size;
1516

    
1517
    ret = bdrv_create_file(filename, NULL, &local_err);
1518
    if (ret < 0) {
1519
        error_propagate(errp, local_err);
1520
        goto exit;
1521
    }
1522

    
1523
    assert(bs == NULL);
1524
    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
1525
                    NULL, &local_err);
1526
    if (ret < 0) {
1527
        error_propagate(errp, local_err);
1528
        goto exit;
1529
    }
1530

    
1531
    if (flat) {
1532
        ret = bdrv_truncate(bs, filesize);
1533
        if (ret < 0) {
1534
            error_setg_errno(errp, -ret, "Could not truncate file");
1535
        }
1536
        goto exit;
1537
    }
1538
    magic = cpu_to_be32(VMDK4_MAGIC);
1539
    memset(&header, 0, sizeof(header));
1540
    header.version = zeroed_grain ? 2 : 1;
1541
    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
1542
                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
1543
                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
1544
    header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1545
    header.capacity = filesize / BDRV_SECTOR_SIZE;
1546
    header.granularity = 128;
1547
    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
1548

    
1549
    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
1550
    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
1551
                           BDRV_SECTOR_SIZE);
1552
    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
1553
    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
1554

    
1555
    header.desc_offset = 1;
1556
    header.desc_size = 20;
1557
    header.rgd_offset = header.desc_offset + header.desc_size;
1558
    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
1559
    header.grain_offset =
1560
        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
1561
                 header.granularity);
1562
    /* swap endianness for all header fields */
1563
    header.version = cpu_to_le32(header.version);
1564
    header.flags = cpu_to_le32(header.flags);
1565
    header.capacity = cpu_to_le64(header.capacity);
1566
    header.granularity = cpu_to_le64(header.granularity);
1567
    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
1568
    header.desc_offset = cpu_to_le64(header.desc_offset);
1569
    header.desc_size = cpu_to_le64(header.desc_size);
1570
    header.rgd_offset = cpu_to_le64(header.rgd_offset);
1571
    header.gd_offset = cpu_to_le64(header.gd_offset);
1572
    header.grain_offset = cpu_to_le64(header.grain_offset);
1573
    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1574

    
1575
    header.check_bytes[0] = 0xa;
1576
    header.check_bytes[1] = 0x20;
1577
    header.check_bytes[2] = 0xd;
1578
    header.check_bytes[3] = 0xa;
1579

    
1580
    /* write all the data */
1581
    ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
1582
    if (ret < 0) {
1583
        error_set(errp, QERR_IO_ERROR);
1584
        goto exit;
1585
    }
1586
    ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
1587
    if (ret < 0) {
1588
        error_set(errp, QERR_IO_ERROR);
1589
        goto exit;
1590
    }
1591

    
1592
    ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9);
1593
    if (ret < 0) {
1594
        error_setg_errno(errp, -ret, "Could not truncate file");
1595
        goto exit;
1596
    }
1597

    
1598
    /* write grain directory */
1599
    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
1600
    gd_buf = g_malloc0(gd_buf_size);
1601
    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
1602
         i < gt_count; i++, tmp += gt_size) {
1603
        gd_buf[i] = cpu_to_le32(tmp);
1604
    }
1605
    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
1606
                      gd_buf, gd_buf_size);
1607
    if (ret < 0) {
1608
        error_set(errp, QERR_IO_ERROR);
1609
        goto exit;
1610
    }
1611

    
1612
    /* write backup grain directory */
1613
    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
1614
         i < gt_count; i++, tmp += gt_size) {
1615
        gd_buf[i] = cpu_to_le32(tmp);
1616
    }
1617
    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
1618
                      gd_buf, gd_buf_size);
1619
    if (ret < 0) {
1620
        error_set(errp, QERR_IO_ERROR);
1621
        goto exit;
1622
    }
1623

    
1624
    ret = 0;
1625
exit:
1626
    if (bs) {
1627
        bdrv_unref(bs);
1628
    }
1629
    g_free(gd_buf);
1630
    return ret;
1631
}
1632

    
1633
/*
 * Split @filename into a directory part, a base name and an extension.
 *
 * @path:    receives everything up to and including the last '/', or,
 *           failing that, the last '\\' or ':'; empty if none is present
 * @prefix:  receives the remainder up to (not including) its last '.'
 * @postfix: receives the last '.' and everything after it; empty if the
 *           remainder contains no '.'
 * @buf_len: size of each of the three output buffers
 *
 * Returns VMDK_OK on success.  Returns VMDK_ERROR, with @errp set, when no
 * filename is given or when the directory part or the base name does not
 * fit into @buf_len bytes.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
                              char *postfix, size_t buf_len, Error **errp)
{
    const char *p, *q;

    if (filename == NULL || !strlen(filename)) {
        error_setg(errp, "No filename provided");
        return VMDK_ERROR;
    }
    p = strrchr(filename, '/');
    if (p == NULL) {
        p = strrchr(filename, '\\');
    }
    if (p == NULL) {
        p = strrchr(filename, ':');
    }
    if (p != NULL) {
        /* keep the separator as part of the path */
        p++;
        if (p - filename >= buf_len) {
            error_setg(errp, "Path is too long");
            return VMDK_ERROR;
        }
        pstrcpy(path, p - filename + 1, filename);
    } else {
        p = filename;
        path[0] = '\0';
    }
    q = strrchr(p, '.');
    if (q == NULL) {
        pstrcpy(prefix, buf_len, p);
        postfix[0] = '\0';
    } else {
        if (q - p >= buf_len) {
            error_setg(errp, "Filename is too long");
            return VMDK_ERROR;
        }
        pstrcpy(prefix, q - p + 1, p);
        pstrcpy(postfix, buf_len, q);
    }
    return VMDK_OK;
}
1672

    
1673
static int vmdk_create(const char *filename, QEMUOptionParameter *options,
1674
                       Error **errp)
1675
{
1676
    int idx = 0;
1677
    BlockDriverState *new_bs = NULL;
1678
    Error *local_err;
1679
    char *desc = NULL;
1680
    int64_t total_size = 0, filesize;
1681
    const char *adapter_type = NULL;
1682
    const char *backing_file = NULL;
1683
    const char *fmt = NULL;
1684
    int flags = 0;
1685
    int ret = 0;
1686
    bool flat, split, compress;
1687
    GString *ext_desc_lines;
1688
    char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX];
1689
    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
1690
    const char *desc_extent_line;
1691
    char parent_desc_line[BUF_SIZE] = "";
1692
    uint32_t parent_cid = 0xffffffff;
1693
    uint32_t number_heads = 16;
1694
    bool zeroed_grain = false;
1695
    uint32_t desc_offset = 0, desc_len;
1696
    const char desc_template[] =
1697
        "# Disk DescriptorFile\n"
1698
        "version=1\n"
1699
        "CID=%x\n"
1700
        "parentCID=%x\n"
1701
        "createType=\"%s\"\n"
1702
        "%s"
1703
        "\n"
1704
        "# Extent description\n"
1705
        "%s"
1706
        "\n"
1707
        "# The Disk Data Base\n"
1708
        "#DDB\n"
1709
        "\n"
1710
        "ddb.virtualHWVersion = \"%d\"\n"
1711
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
1712
        "ddb.geometry.heads = \"%d\"\n"
1713
        "ddb.geometry.sectors = \"63\"\n"
1714
        "ddb.adapterType = \"%s\"\n";
1715

    
1716
    ext_desc_lines = g_string_new(NULL);
1717

    
1718
    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
1719
        ret = -EINVAL;
1720
        goto exit;
1721
    }
1722
    /* Read out options */
1723
    while (options && options->name) {
1724
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1725
            total_size = options->value.n;
1726
        } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) {
1727
            adapter_type = options->value.s;
1728
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1729
            backing_file = options->value.s;
1730
        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
1731
            flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
1732
        } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
1733
            fmt = options->value.s;
1734
        } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) {
1735
            zeroed_grain |= options->value.n;
1736
        }
1737
        options++;
1738
    }
1739
    if (!adapter_type) {
1740
        adapter_type = "ide";
1741
    } else if (strcmp(adapter_type, "ide") &&
1742
               strcmp(adapter_type, "buslogic") &&
1743
               strcmp(adapter_type, "lsilogic") &&
1744
               strcmp(adapter_type, "legacyESX")) {
1745
        error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
1746
        ret = -EINVAL;
1747
        goto exit;
1748
    }
1749
    if (strcmp(adapter_type, "ide") != 0) {
1750
        /* that's the number of heads with which vmware operates when
1751
           creating, exporting, etc. vmdk files with a non-ide adapter type */
1752
        number_heads = 255;
1753
    }
1754
    if (!fmt) {
1755
        /* Default format to monolithicSparse */
1756
        fmt = "monolithicSparse";
1757
    } else if (strcmp(fmt, "monolithicFlat") &&
1758
               strcmp(fmt, "monolithicSparse") &&
1759
               strcmp(fmt, "twoGbMaxExtentSparse") &&
1760
               strcmp(fmt, "twoGbMaxExtentFlat") &&
1761
               strcmp(fmt, "streamOptimized")) {
1762
        error_setg(errp, "Unknown subformat: '%s'", fmt);
1763
        ret = -EINVAL;
1764
        goto exit;
1765
    }
1766
    split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
1767
              strcmp(fmt, "twoGbMaxExtentSparse"));
1768
    flat = !(strcmp(fmt, "monolithicFlat") &&
1769
             strcmp(fmt, "twoGbMaxExtentFlat"));
1770
    compress = !strcmp(fmt, "streamOptimized");
1771
    if (flat) {
1772
        desc_extent_line = "RW %lld FLAT \"%s\" 0\n";
1773
    } else {
1774
        desc_extent_line = "RW %lld SPARSE \"%s\"\n";
1775
    }
1776
    if (flat && backing_file) {
1777
        error_setg(errp, "Flat image can't have backing file");
1778
        ret = -ENOTSUP;
1779
        goto exit;
1780
    }
1781
    if (flat && zeroed_grain) {
1782
        error_setg(errp, "Flat image can't enable zeroed grain");
1783
        ret = -ENOTSUP;
1784
        goto exit;
1785
    }
1786
    if (backing_file) {
1787
        BlockDriverState *bs = NULL;
1788
        ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_NO_BACKING, NULL,
1789
                        errp);
1790
        if (ret != 0) {
1791
            goto exit;
1792
        }
1793
        if (strcmp(bs->drv->format_name, "vmdk")) {
1794
            bdrv_unref(bs);
1795
            ret = -EINVAL;
1796
            goto exit;
1797
        }
1798
        parent_cid = vmdk_read_cid(bs, 0);
1799
        bdrv_unref(bs);
1800
        snprintf(parent_desc_line, sizeof(parent_desc_line),
1801
                "parentFileNameHint=\"%s\"", backing_file);
1802
    }
1803

    
1804
    /* Create extents */
1805
    filesize = total_size;
1806
    while (filesize > 0) {
1807
        char desc_line[BUF_SIZE];
1808
        char ext_filename[PATH_MAX];
1809
        char desc_filename[PATH_MAX];
1810
        int64_t size = filesize;
1811

    
1812
        if (split && size > split_size) {
1813
            size = split_size;
1814
        }
1815
        if (split) {
1816
            snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s",
1817
                    prefix, flat ? 'f' : 's', ++idx, postfix);
1818
        } else if (flat) {
1819
            snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s",
1820
                    prefix, postfix);
1821
        } else {
1822
            snprintf(desc_filename, sizeof(desc_filename), "%s%s",
1823
                    prefix, postfix);
1824
        }
1825
        snprintf(ext_filename, sizeof(ext_filename), "%s%s",
1826
                path, desc_filename);
1827

    
1828
        if (vmdk_create_extent(ext_filename, size,
1829
                               flat, compress, zeroed_grain, errp)) {
1830
            ret = -EINVAL;
1831
            goto exit;
1832
        }
1833
        filesize -= size;
1834

    
1835
        /* Format description line */
1836
        snprintf(desc_line, sizeof(desc_line),
1837
                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
1838
        g_string_append(ext_desc_lines, desc_line);
1839
    }
1840
    /* generate descriptor file */
1841
    desc = g_strdup_printf(desc_template,
1842
                           (unsigned int)time(NULL),
1843
                           parent_cid,
1844
                           fmt,
1845
                           parent_desc_line,
1846
                           ext_desc_lines->str,
1847
                           (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
1848
                           total_size /
1849
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
1850
                           number_heads,
1851
                           adapter_type);
1852
    desc_len = strlen(desc);
1853
    /* the descriptor offset = 0x200 */
1854
    if (!split && !flat) {
1855
        desc_offset = 0x200;
1856
    } else {
1857
        ret = bdrv_create_file(filename, options, &local_err);
1858
        if (ret < 0) {
1859
            error_setg_errno(errp, -ret, "Could not create image file");
1860
            goto exit;
1861
        }
1862
    }
1863
    assert(new_bs == NULL);
1864
    ret = bdrv_open(&new_bs, filename, NULL, NULL,
1865
                    BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err);
1866
    if (ret < 0) {
1867
        error_setg_errno(errp, -ret, "Could not write description");
1868
        goto exit;
1869
    }
1870
    ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len);
1871
    if (ret < 0) {
1872
        error_setg_errno(errp, -ret, "Could not write description");
1873
        goto exit;
1874
    }
1875
    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
1876
     * for description file */
1877
    if (desc_offset == 0) {
1878
        ret = bdrv_truncate(new_bs, desc_len);
1879
        if (ret < 0) {
1880
            error_setg_errno(errp, -ret, "Could not truncate file");
1881
        }
1882
    }
1883
exit:
1884
    if (new_bs) {
1885
        bdrv_unref(new_bs);
1886
    }
1887
    g_free(desc);
1888
    g_string_free(ext_desc_lines, true);
1889
    return ret;
1890
}
1891

    
1892
/*
 * Tear down the driver state attached to @bs: release the extents, free
 * the stored create type string, then unregister and free the migration
 * blocker held in the state.
 */
static void vmdk_close(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;

    vmdk_free_extents(bs);
    g_free(state->create_type);

    /* The blocker is removed from the migration code before it is freed. */
    migrate_del_blocker(state->migration_blocker);
    error_free(state->migration_blocker);
}
1902

    
1903
/*
 * Flush the backing file of every extent of @bs.
 *
 * A failing extent does not stop the loop: all extents are still flushed.
 * Returns 0 if every flush succeeded, otherwise the (negative) status of
 * the last extent that failed.
 */
static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int last_error = 0;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        int status = bdrv_co_flush(state->extents[idx].file);

        if (status < 0) {
            last_error = status;
        }
    }
    return last_error;
}
1917

    
1918
/*
 * Sum the allocated size of bs->file and of every extent file.
 *
 * Extents whose file is bs->file itself are not queried again, so that
 * file is only counted once. The first negative value returned by
 * bdrv_get_allocated_file_size() is passed straight back to the caller.
 */
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int64_t total;
    int64_t extent_size;
    int idx;

    total = bdrv_get_allocated_file_size(bs->file);
    if (total < 0) {
        return total;
    }

    for (idx = 0; idx < state->num_extents; idx++) {
        if (state->extents[idx].file != bs->file) {
            extent_size = bdrv_get_allocated_file_size(state->extents[idx].file);
            if (extent_size < 0) {
                return extent_size;
            }
            total += extent_size;
        }
    }
    return total;
}
1941

    
1942
/*
 * Report whether the image can be treated as zero-initialized.
 *
 * Returns 0 as soon as a flat extent is found whose underlying file does
 * not report zero init; returns 1 otherwise. Non-flat extents are not
 * queried.
 */
static int vmdk_has_zero_init(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        /* Short-circuit keeps the query restricted to flat extents. */
        if (state->extents[idx].flat &&
            !bdrv_has_zero_init(state->extents[idx].file)) {
            return 0;
        }
    }
    return 1;
}
1958

    
1959
/*
 * Build a newly allocated ImageInfo describing a single extent.
 *
 * The filename and format strings are duplicated. The compressed field is
 * only flagged as present when the extent is compressed, and the cluster
 * size is only flagged as present for non-flat extents. All fields not
 * set here stay zero.
 */
static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
{
    /* g_new0() zero-fills, so untouched fields remain 0/NULL/false. */
    ImageInfo *extent_info = g_new0(ImageInfo, 1);

    extent_info->filename = g_strdup(extent->file->filename);
    extent_info->format = g_strdup(extent->type);
    extent_info->virtual_size = extent->sectors * BDRV_SECTOR_SIZE;

    extent_info->compressed = extent->compressed;
    extent_info->has_compressed = extent->compressed;

    extent_info->cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    extent_info->has_cluster_size = !extent->flat;

    return extent_info;
}
1975

    
1976
static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
1977
                      BdrvCheckMode fix)
1978
{
1979
    BDRVVmdkState *s = bs->opaque;
1980
    VmdkExtent *extent = NULL;
1981
    int64_t sector_num = 0;
1982
    int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
1983
    int ret;
1984
    uint64_t cluster_offset;
1985

    
1986
    if (fix) {
1987
        return -ENOTSUP;
1988
    }
1989

    
1990
    for (;;) {
1991
        if (sector_num >= total_sectors) {
1992
            return 0;
1993
        }
1994
        extent = find_extent(s, sector_num, extent);
1995
        if (!extent) {
1996
            fprintf(stderr,
1997
                    "ERROR: could not find extent for sector %" PRId64 "\n",
1998
                    sector_num);
1999
            break;
2000
        }
2001
        ret = get_cluster_offset(bs, extent, NULL,
2002
                                 sector_num << BDRV_SECTOR_BITS,
2003
                                 0, &cluster_offset);
2004
        if (ret == VMDK_ERROR) {
2005
            fprintf(stderr,
2006
                    "ERROR: could not get cluster_offset for sector %"
2007
                    PRId64 "\n", sector_num);
2008
            break;
2009
        }
2010
        if (ret == VMDK_OK && cluster_offset >= bdrv_getlength(extent->file)) {
2011
            fprintf(stderr,
2012
                    "ERROR: cluster offset for sector %"
2013
                    PRId64 " points after EOF\n", sector_num);
2014
            break;
2015
        }
2016
        sector_num += extent->cluster_sectors;
2017
    }
2018

    
2019
    result->corruptions++;
2020
    return 0;
2021
}
2022

    
2023
/*
 * Build the VMDK-specific image information for @bs.
 *
 * Returns a newly allocated ImageInfoSpecific of kind
 * IMAGE_INFO_SPECIFIC_KIND_VMDK carrying a copy of the create type, the
 * CID and parent CID from the driver state, and a list with one ImageInfo
 * per extent, in extent order.
 */
static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
    ImageInfoSpecificVmdk *vmdk_info = g_new0(ImageInfoSpecificVmdk, 1);
    ImageInfoList **tail;
    ImageInfoList *node;
    int idx;

    spec_info->kind = IMAGE_INFO_SPECIFIC_KIND_VMDK;
    spec_info->vmdk = vmdk_info;

    vmdk_info->create_type = g_strdup(state->create_type);
    vmdk_info->cid = state->cid;
    vmdk_info->parent_cid = state->parent_cid;

    /* Append at the tail so the list keeps the order of s->extents. */
    tail = &vmdk_info->extents;
    for (idx = 0; idx < state->num_extents; idx++) {
        node = g_new0(ImageInfoList, 1);
        node->value = vmdk_get_extent_info(&state->extents[idx]);
        node->next = NULL;

        *tail = node;
        tail = &node->next;
    }

    return spec_info;
}
2053

    
2054
static QEMUOptionParameter vmdk_create_options[] = {
2055
    {
2056
        .name = BLOCK_OPT_SIZE,
2057
        .type = OPT_SIZE,
2058
        .help = "Virtual disk size"
2059
    },
2060
    {
2061
        .name = BLOCK_OPT_ADAPTER_TYPE,
2062
        .type = OPT_STRING,
2063
        .help = "Virtual adapter type, can be one of "
2064
                "ide (default), lsilogic, buslogic or legacyESX"
2065
    },
2066
    {
2067
        .name = BLOCK_OPT_BACKING_FILE,
2068
        .type = OPT_STRING,
2069
        .help = "File name of a base image"
2070
    },
2071
    {
2072
        .name = BLOCK_OPT_COMPAT6,
2073
        .type = OPT_FLAG,
2074
        .help = "VMDK version 6 image"
2075
    },
2076
    {
2077
        .name = BLOCK_OPT_SUBFMT,
2078
        .type = OPT_STRING,
2079
        .help =
2080
            "VMDK flat extent format, can be one of "
2081
            "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
2082
    },
2083
    {
2084
        .name = BLOCK_OPT_ZEROED_GRAIN,
2085
        .type = OPT_FLAG,
2086
        .help = "Enable efficient zero writes using the zeroed-grain GTE feature"
2087
    },
2088
    { NULL }
2089
};
2090

    
2091
static BlockDriver bdrv_vmdk = {
2092
    .format_name                  = "vmdk",
2093
    .instance_size                = sizeof(BDRVVmdkState),
2094
    .bdrv_probe                   = vmdk_probe,
2095
    .bdrv_open                    = vmdk_open,
2096
    .bdrv_check                   = vmdk_check,
2097
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
2098
    .bdrv_read                    = vmdk_co_read,
2099
    .bdrv_write                   = vmdk_co_write,
2100
    .bdrv_co_write_zeroes         = vmdk_co_write_zeroes,
2101
    .bdrv_close                   = vmdk_close,
2102
    .bdrv_create                  = vmdk_create,
2103
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
2104
    .bdrv_co_get_block_status     = vmdk_co_get_block_status,
2105
    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
2106
    .bdrv_has_zero_init           = vmdk_has_zero_init,
2107
    .bdrv_get_specific_info       = vmdk_get_specific_info,
2108
    .bdrv_refresh_limits          = vmdk_refresh_limits,
2109

    
2110
    .create_options               = vmdk_create_options,
2111
};
2112

    
2113
static void bdrv_vmdk_init(void)
2114
{
2115
    bdrv_register(&bdrv_vmdk);
2116
}
2117

    
2118
block_init(bdrv_vmdk_init);