Statistics
| Branch: | Revision:

root / block / vmdk.c @ 0e69c543

History | View | Annotate | Download (29.4 kB)

1
/*
2
 * Block driver for the VMDK format
3
 *
4
 * Copyright (c) 2004 Fabrice Bellard
5
 * Copyright (c) 2005 Filip Navara
6
 *
7
 * Permission is hereby granted, free of charge, to any person obtaining a copy
8
 * of this software and associated documentation files (the "Software"), to deal
9
 * in the Software without restriction, including without limitation the rights
10
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the Software is
12
 * furnished to do so, subject to the following conditions:
13
 *
14
 * The above copyright notice and this permission notice shall be included in
15
 * all copies or substantial portions of the Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
 * THE SOFTWARE.
24
 */
25

    
26
#include "qemu-common.h"
27
#include "block_int.h"
28
#include "module.h"
29

    
30
/* Image signatures, expressed as the 32-bit value obtained by reading the
 * first four bytes of the file as a big-endian integer. */
#define VMDK3_MAGIC 0x434f5744 /* ASCII 'C' 'O' 'W' 'D' */
#define VMDK4_MAGIC 0x4b444d56 /* ASCII 'K' 'D' 'M' 'V' */
32

    
33
/*
 * VMDK3 ("COWD") header as it is read from disk right after the 4-byte
 * magic.  vmdk_open() converts the fields it uses with le32_to_cpu().
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* used as the extent size, in sectors */
    uint32_t granularity;       /* used as the cluster size, in sectors */
    uint32_t l1dir_offset;      /* L1 table position, in sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
45

    
46
/*
 * VMDK4 ("KDMV") header as it is read from disk right after the 4-byte
 * magic.  The structure is packed so that it can be read and written
 * verbatim; multi-byte fields are little-endian on disk.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;           /* extent size, in sectors */
    int64_t granularity;        /* grain (cluster) size, in sectors */
    int64_t desc_offset;        /* descriptor position, in sectors */
    int64_t desc_size;          /* descriptor length, in sectors */
    int32_t num_gtes_per_gte;   /* entries per grain table (L2 size) */
    int64_t rgd_offset;         /* first grain directory, in sectors */
    int64_t gd_offset;          /* second grain directory, in sectors */
    int64_t grain_offset;       /* start of grain data, in sectors */
    char filler[1];
    char check_bytes[4];        /* vmdk_create() stores 0x0a 0x20 0x0d 0x0a */
} __attribute__((packed)) VMDK4Header;
60

    
61
#define L2_CACHE_SIZE 16
62

    
63
typedef struct VmdkExtent {
64
    BlockDriverState *file;
65
    bool flat;
66
    int64_t sectors;
67
    int64_t end_sector;
68
    int64_t l1_table_offset;
69
    int64_t l1_backup_table_offset;
70
    uint32_t *l1_table;
71
    uint32_t *l1_backup_table;
72
    unsigned int l1_size;
73
    uint32_t l1_entry_sectors;
74

    
75
    unsigned int l2_size;
76
    uint32_t *l2_cache;
77
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
78
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
79

    
80
    unsigned int cluster_sectors;
81
} VmdkExtent;
82

    
83
typedef struct BDRVVmdkState {
84
    uint32_t parent_cid;
85
    int num_extents;
86
    /* Extent array with num_extents entries, ascend ordered by address */
87
    VmdkExtent *extents;
88
} BDRVVmdkState;
89

    
90
/*
 * Description of a pending L2 entry update, filled in by
 * get_cluster_offset() when it allocates a cluster and consumed by
 * vmdk_L2update().
 */
typedef struct VmdkMetaData {
    uint32_t offset;            /* new L2 entry value, already little-endian */
    unsigned int l1_index;      /* index into the L1 table */
    unsigned int l2_index;      /* index into the L2 table */
    unsigned int l2_offset;     /* L2 table position, in sectors */
    int valid;                  /* non-zero when the fields above are set */
} VmdkMetaData;
97

    
98
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
99
{
100
    uint32_t magic;
101

    
102
    if (buf_size < 4)
103
        return 0;
104
    magic = be32_to_cpu(*(uint32_t *)buf);
105
    if (magic == VMDK3_MAGIC ||
106
        magic == VMDK4_MAGIC)
107
        return 100;
108
    else
109
        return 0;
110
}
111

    
112
/* When defined, vmdk_is_cid_valid() compares the stored parent CID with the
 * CID currently found in the backing image. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/* Parenthesized so the macro stays a single operand inside larger
 * expressions such as "x / DESC_SIZE" or "x % DESC_SIZE". */
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
117

    
118
/*
 * Release the tables owned by every extent of the image, then the extent
 * array itself.  The pointers and the extent count stored in the driver
 * state are left as they were.
 */
static void vmdk_free_extents(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int idx = 0;

    while (idx < s->num_extents) {
        VmdkExtent *e = &s->extents[idx];

        qemu_free(e->l1_table);
        qemu_free(e->l2_cache);
        qemu_free(e->l1_backup_table);
        idx++;
    }
    qemu_free(s->extents);
}
130

    
131
/*
 * Read a content ID from the embedded descriptor of an image.
 *
 * bs:     image whose descriptor (20 sectors at byte offset 0x200 of
 *         bs->file) is parsed.
 * parent: non-zero to look up "parentCID", zero to look up "CID".
 *
 * Returns the hexadecimal value following the key, 0 if the descriptor
 * cannot be read, or 0xffffffff if the key is missing or has no parsable
 * value (the same value vmdk_create() stores as parentCID for an image
 * without a parent).
 */
static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
{
    /* one spare byte so the text can always be NUL-terminated */
    char desc[DESC_SIZE + 1];
    /* defined result even when the key is not found or sscanf() fails */
    uint32_t cid = 0xffffffff;
    const char *p_name, *cid_str;
    size_t cid_str_size;

    /* the descriptor offset = 0x200 */
    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE) {
        return 0;
    }
    /* the on-disk text is not guaranteed to contain a terminator */
    desc[DESC_SIZE] = '\0';

    /* sizeof() includes the NUL, which accounts for the '=' after the key */
    if (parent) {
        cid_str = "parentCID";
        cid_str_size = sizeof("parentCID");
    } else {
        cid_str = "CID";
        cid_str_size = sizeof("CID");
    }

    p_name = strstr(desc, cid_str);
    if (p_name != NULL) {
        p_name += cid_str_size;
        sscanf(p_name, "%x", &cid);
    }

    return cid;
}
157

    
158
/*
 * Replace the value of the "CID" entry in the embedded descriptor.
 *
 * The descriptor (20 sectors at byte offset 0x200 of bs->file) is read,
 * the text after "CID=" is replaced with cid in hexadecimal, the original
 * text from "parentCID" onwards is re-appended, and the buffer is written
 * back with bdrv_pwrite_sync().
 *
 * Returns 0 on success and -1 if the descriptor cannot be read or written,
 * or if it has no "parentCID" entry (nothing is written in that case).
 */
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    /* one spare byte each so the text can always be NUL-terminated */
    char desc[DESC_SIZE + 1], tmp_desc[DESC_SIZE + 1];
    char *p_name, *tmp_str;

    /* the descriptor offset = 0x200 */
    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE) {
        return -1;
    }
    /* the on-disk text is not guaranteed to contain a terminator */
    desc[DESC_SIZE] = '\0';

    /* save everything from "parentCID" on; it is re-appended below */
    tmp_str = strstr(desc, "parentCID");
    if (tmp_str == NULL) {
        /* without this check a NULL pointer would be copied from */
        return -1;
    }
    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);

    p_name = strstr(desc, "CID");
    if (p_name != NULL) {
        /* sizeof() includes the NUL, which accounts for the '=' */
        p_name += sizeof("CID");
        snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
        pstrcat(desc, sizeof(desc), tmp_desc);
    }

    if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0) {
        return -1;
    }
    return 0;
}
179

    
180
/*
 * Check that the backing image still carries the CID this image recorded
 * as its parentCID when it was opened.
 *
 * Returns 1 when there is no backing image, when the two values match, or
 * when CHECK_CID is not defined; returns 0 on a mismatch.
 */
static int vmdk_is_cid_valid(BlockDriverState *bs)
{
#ifdef CHECK_CID
    BDRVVmdkState *s = bs->opaque;
    BlockDriverState *parent = bs->backing_hd;

    if (parent != NULL && vmdk_read_cid(parent, 0) != s->parent_cid) {
        /* CID not valid */
        return 0;
    }
#endif
    /* CID valid */
    return 1;
}
197

    
198
static int vmdk_snapshot_create(const char *filename, const char *backing_file)
199
{
200
    int snp_fd, p_fd;
201
    int ret;
202
    uint32_t p_cid;
203
    char *p_name, *gd_buf, *rgd_buf;
204
    const char *real_filename, *temp_str;
205
    VMDK4Header header;
206
    uint32_t gde_entries, gd_size;
207
    int64_t gd_offset, rgd_offset, capacity, gt_size;
208
    char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
209
    static const char desc_template[] =
210
    "# Disk DescriptorFile\n"
211
    "version=1\n"
212
    "CID=%x\n"
213
    "parentCID=%x\n"
214
    "createType=\"monolithicSparse\"\n"
215
    "parentFileNameHint=\"%s\"\n"
216
    "\n"
217
    "# Extent description\n"
218
    "RW %u SPARSE \"%s\"\n"
219
    "\n"
220
    "# The Disk Data Base \n"
221
    "#DDB\n"
222
    "\n";
223

    
224
    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
225
    if (snp_fd < 0)
226
        return -errno;
227
    p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
228
    if (p_fd < 0) {
229
        close(snp_fd);
230
        return -errno;
231
    }
232

    
233
    /* read the header */
234
    if (lseek(p_fd, 0x0, SEEK_SET) == -1) {
235
        ret = -errno;
236
        goto fail;
237
    }
238
    if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE) {
239
        ret = -errno;
240
        goto fail;
241
    }
242

    
243
    /* write the header */
244
    if (lseek(snp_fd, 0x0, SEEK_SET) == -1) {
245
        ret = -errno;
246
        goto fail;
247
    }
248
    if (write(snp_fd, hdr, HEADER_SIZE) == -1) {
249
        ret = -errno;
250
        goto fail;
251
    }
252

    
253
    memset(&header, 0, sizeof(header));
254
    memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
255

    
256
    if (ftruncate(snp_fd, header.grain_offset << 9)) {
257
        ret = -errno;
258
        goto fail;
259
    }
260
    /* the descriptor offset = 0x200 */
261
    if (lseek(p_fd, 0x200, SEEK_SET) == -1) {
262
        ret = -errno;
263
        goto fail;
264
    }
265
    if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE) {
266
        ret = -errno;
267
        goto fail;
268
    }
269

    
270
    if ((p_name = strstr(p_desc,"CID")) != NULL) {
271
        p_name += sizeof("CID");
272
        sscanf(p_name,"%x",&p_cid);
273
    }
274

    
275
    real_filename = filename;
276
    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
277
        real_filename = temp_str + 1;
278
    if ((temp_str = strrchr(real_filename, '/')) != NULL)
279
        real_filename = temp_str + 1;
280
    if ((temp_str = strrchr(real_filename, ':')) != NULL)
281
        real_filename = temp_str + 1;
282

    
283
    snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
284
             (uint32_t)header.capacity, real_filename);
285

    
286
    /* write the descriptor */
287
    if (lseek(snp_fd, 0x200, SEEK_SET) == -1) {
288
        ret = -errno;
289
        goto fail;
290
    }
291
    if (write(snp_fd, s_desc, strlen(s_desc)) == -1) {
292
        ret = -errno;
293
        goto fail;
294
    }
295

    
296
    gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
297
    rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
298
    capacity = header.capacity * SECTOR_SIZE;       // Extent size
299
    /*
300
     * Each GDE span 32M disk, means:
301
     * 512 GTE per GT, each GTE points to grain
302
     */
303
    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
304
    if (!gt_size) {
305
        ret = -EINVAL;
306
        goto fail;
307
    }
308
    gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
309
    gd_size = gde_entries * sizeof(uint32_t);
310

    
311
    /* write RGD */
312
    rgd_buf = qemu_malloc(gd_size);
313
    if (lseek(p_fd, rgd_offset, SEEK_SET) == -1) {
314
        ret = -errno;
315
        goto fail_rgd;
316
    }
317
    if (read(p_fd, rgd_buf, gd_size) != gd_size) {
318
        ret = -errno;
319
        goto fail_rgd;
320
    }
321
    if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1) {
322
        ret = -errno;
323
        goto fail_rgd;
324
    }
325
    if (write(snp_fd, rgd_buf, gd_size) == -1) {
326
        ret = -errno;
327
        goto fail_rgd;
328
    }
329

    
330
    /* write GD */
331
    gd_buf = qemu_malloc(gd_size);
332
    if (lseek(p_fd, gd_offset, SEEK_SET) == -1) {
333
        ret = -errno;
334
        goto fail_gd;
335
    }
336
    if (read(p_fd, gd_buf, gd_size) != gd_size) {
337
        ret = -errno;
338
        goto fail_gd;
339
    }
340
    if (lseek(snp_fd, gd_offset, SEEK_SET) == -1) {
341
        ret = -errno;
342
        goto fail_gd;
343
    }
344
    if (write(snp_fd, gd_buf, gd_size) == -1) {
345
        ret = -errno;
346
        goto fail_gd;
347
    }
348
    ret = 0;
349

    
350
fail_gd:
351
    qemu_free(gd_buf);
352
fail_rgd:
353
    qemu_free(rgd_buf);
354
fail:
355
    close(p_fd);
356
    close(snp_fd);
357
    return ret;
358
}
359

    
360
/*
 * Extract the parent image name from the embedded descriptor.
 *
 * If the descriptor (20 sectors at byte offset 0x200 of bs->file) contains
 * a parentFileNameHint="..." entry, the quoted name is copied into
 * bs->backing_file.  A descriptor without that entry is not an error.
 *
 * Returns 0 on success, -1 if the descriptor cannot be read, the entry is
 * malformed, or the name does not fit into bs->backing_file.
 */
static int vmdk_parent_open(BlockDriverState *bs)
{
    char *p_name;
    /* one spare byte so the text can always be NUL-terminated */
    char desc[DESC_SIZE + 1];

    /* the descriptor offset = 0x200 */
    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE) {
        return -1;
    }
    /* the on-disk text is not guaranteed to contain a terminator */
    desc[DESC_SIZE] = '\0';

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
        char *end_name;

        p_name += strlen("parentFileNameHint");
        /* step over '=' and the opening quote, but never past the NUL */
        if (p_name[0] == '\0' || p_name[1] == '\0') {
            return -1;
        }
        p_name += 2;

        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
            return -1;
        }
        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
            return -1;
        }

        /* size argument = name length + 1 for the terminator */
        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

    return 0;
}
383

    
384
/* Create and append extent to the extent array. Return the added VmdkExtent
385
 * address. return NULL if allocation failed. */
386
/*
 * Grow the extent array by one element, initialize the new element from
 * the arguments and return its address.
 *
 * end_sector of the new extent continues from the previous extent (or
 * from 0 for the first one), and bs->total_sectors is updated to it.
 * The result of qemu_realloc() is used without a check in this function.
 */
static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
                           BlockDriverState *file, bool flat, int64_t sectors,
                           int64_t l1_offset, int64_t l1_backup_offset,
                           uint32_t l1_size,
                           int l2_size, unsigned int cluster_sectors)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent;
    int64_t prev_end = 0;

    s->extents = qemu_realloc(s->extents,
                              (s->num_extents + 1) * sizeof(VmdkExtent));
    extent = &s->extents[s->num_extents];
    s->num_extents++;

    memset(extent, 0, sizeof(*extent));
    extent->file = file;
    extent->flat = flat;
    extent->sectors = sectors;
    extent->l1_table_offset = l1_offset;
    extent->l1_backup_table_offset = l1_backup_offset;
    extent->l1_size = l1_size;
    extent->l2_size = l2_size;
    extent->cluster_sectors = cluster_sectors;
    extent->l1_entry_sectors = l2_size * cluster_sectors;

    /* extents are appended in address order */
    if (s->num_extents > 1) {
        prev_end = extent[-1].end_sector;
    }
    extent->end_sector = prev_end + extent->sectors;
    bs->total_sectors = extent->end_sector;
    return extent;
}
419

    
420

    
421
/*
 * Open a VMDK3 or VMDK4 image.
 *
 * Reads the magic and the matching header from bs->file, registers a
 * single sparse extent, loads its L1 table (and the backup L1 table when
 * the header provides one) converted to host byte order, and allocates the
 * L2 cache.  For VMDK4 the parent file name and the parent CID are also
 * taken from the descriptor.  flags is not used.
 *
 * Returns 0 on success; on any failure the extents are freed and -1 is
 * returned.
 */
static int vmdk_open(BlockDriverState *bs, int flags)
{
    BDRVVmdkState *s = bs->opaque;
    uint32_t magic;
    int i;
    uint32_t l1_size, l1_entry_sectors;
    VmdkExtent *extent = NULL;

    if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic)) {
        goto fail;
    }

    magic = be32_to_cpu(magic);
    if (magic == VMDK3_MAGIC) {
        VMDK3Header header;

        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
                != sizeof(header)) {
            goto fail;
        }
        /* VMDK3 uses fixed table sizes: 64 L1 entries, 512 L2 entries */
        extent = vmdk_add_extent(bs, bs->file, false,
                              le32_to_cpu(header.disk_sectors),
                              le32_to_cpu(header.l1dir_offset) << 9, 0,
                              1 << 6, 1 << 9, le32_to_cpu(header.granularity));
    } else if (magic == VMDK4_MAGIC) {
        VMDK4Header header;

        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
                != sizeof(header)) {
            goto fail;
        }
        l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
                            * le64_to_cpu(header.granularity);
        /* must be checked before it is used as a divisor just below */
        if (l1_entry_sectors == 0) {
            goto fail;
        }
        l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                    / l1_entry_sectors;
        extent = vmdk_add_extent(bs, bs->file, false,
                              le64_to_cpu(header.capacity),
                              le64_to_cpu(header.gd_offset) << 9,
                              le64_to_cpu(header.rgd_offset) << 9,
                              l1_size,
                              le32_to_cpu(header.num_gtes_per_gte),
                              le64_to_cpu(header.granularity));
        /* pick up the parent image name, if the descriptor has one */
        if (vmdk_parent_open(bs) != 0) {
            goto fail;
        }
        /* remember the parent CID for later vmdk_is_cid_valid() checks */
        s->parent_cid = vmdk_read_cid(bs, 1);
    } else {
        goto fail;
    }

    /* later lookups divide by this value, so zero is rejected for both
     * formats */
    if (extent->l1_entry_sectors == 0) {
        goto fail;
    }

    /* read the L1 table; l1_size now holds its size in bytes */
    l1_size = extent->l1_size * sizeof(uint32_t);
    extent->l1_table = qemu_malloc(l1_size);
    if (bdrv_pread(bs->file,
            extent->l1_table_offset,
            extent->l1_table,
            l1_size)
        != l1_size) {
        goto fail;
    }
    for (i = 0; i < extent->l1_size; i++) {
        le32_to_cpus(&extent->l1_table[i]);
    }

    if (extent->l1_backup_table_offset) {
        extent->l1_backup_table = qemu_malloc(l1_size);
        if (bdrv_pread(bs->file,
                    extent->l1_backup_table_offset,
                    extent->l1_backup_table,
                    l1_size)
                != l1_size) {
            goto fail;
        }
        for (i = 0; i < extent->l1_size; i++) {
            le32_to_cpus(&extent->l1_backup_table[i]);
        }
    }

    extent->l2_cache =
        qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
    return 0;
 fail:
    vmdk_free_extents(bs);
    return -1;
}
507

    
508
static int get_whole_cluster(BlockDriverState *bs,
509
                VmdkExtent *extent,
510
                uint64_t cluster_offset,
511
                uint64_t offset,
512
                bool allocate)
513
{
514
    /* 128 sectors * 512 bytes each = grain size 64KB */
515
    uint8_t  whole_grain[extent->cluster_sectors * 512];
516

    
517
    /* we will be here if it's first write on non-exist grain(cluster).
518
     * try to read from parent image, if exist */
519
    if (bs->backing_hd) {
520
        int ret;
521

    
522
        if (!vmdk_is_cid_valid(bs))
523
            return -1;
524

    
525
        /* floor offset to cluster */
526
        offset -= offset % (extent->cluster_sectors * 512);
527
        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
528
                extent->cluster_sectors);
529
        if (ret < 0) {
530
            return -1;
531
        }
532

    
533
        /* Write grain only into the active image */
534
        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
535
                extent->cluster_sectors);
536
        if (ret < 0) {
537
            return -1;
538
        }
539
    }
540
    return 0;
541
}
542

    
543
/*
 * Store a new L2 entry on disk.
 *
 * m_data->offset is written at index m_data->l2_index of the L2 table
 * located at sector m_data->l2_offset.  If the extent has a backup L1
 * table, m_data->l2_offset is replaced with the backup L2 table's
 * location and the same entry is written there as well.
 *
 * Returns 0 on success, -1 if either synchronous write fails.
 */
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
{
    const int64_t entry_off = m_data->l2_index * sizeof(m_data->offset);
    int64_t pos;

    /* primary L2 table */
    pos = (int64_t)m_data->l2_offset * 512 + entry_off;
    if (bdrv_pwrite_sync(extent->file, pos,
                         &m_data->offset, sizeof(m_data->offset)) < 0) {
        return -1;
    }

    if (extent->l1_backup_table_offset == 0) {
        return 0;
    }

    /* backup L2 table, found through the backup L1 table */
    m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
    pos = (int64_t)m_data->l2_offset * 512 + entry_off;
    if (bdrv_pwrite_sync(extent->file, pos,
                         &m_data->offset, sizeof(m_data->offset)) < 0) {
        return -1;
    }

    return 0;
}
570

    
571
static uint64_t get_cluster_offset(BlockDriverState *bs,
572
                                    VmdkExtent *extent,
573
                                    VmdkMetaData *m_data,
574
                                    uint64_t offset, int allocate)
575
{
576
    unsigned int l1_index, l2_offset, l2_index;
577
    int min_index, i, j;
578
    uint32_t min_count, *l2_table, tmp = 0;
579
    uint64_t cluster_offset;
580

    
581
    if (m_data)
582
        m_data->valid = 0;
583

    
584
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
585
    if (l1_index >= extent->l1_size) {
586
        return 0;
587
    }
588
    l2_offset = extent->l1_table[l1_index];
589
    if (!l2_offset) {
590
        return 0;
591
    }
592
    for(i = 0; i < L2_CACHE_SIZE; i++) {
593
        if (l2_offset == extent->l2_cache_offsets[i]) {
594
            /* increment the hit count */
595
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
596
                for(j = 0; j < L2_CACHE_SIZE; j++) {
597
                    extent->l2_cache_counts[j] >>= 1;
598
                }
599
            }
600
            l2_table = extent->l2_cache + (i * extent->l2_size);
601
            goto found;
602
        }
603
    }
604
    /* not found: load a new entry in the least used one */
605
    min_index = 0;
606
    min_count = 0xffffffff;
607
    for(i = 0; i < L2_CACHE_SIZE; i++) {
608
        if (extent->l2_cache_counts[i] < min_count) {
609
            min_count = extent->l2_cache_counts[i];
610
            min_index = i;
611
        }
612
    }
613
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
614
    if (bdrv_pread(
615
                extent->file,
616
                (int64_t)l2_offset * 512,
617
                l2_table,
618
                extent->l2_size * sizeof(uint32_t)
619
            ) != extent->l2_size * sizeof(uint32_t)) {
620
        return 0;
621
    }
622

    
623
    extent->l2_cache_offsets[min_index] = l2_offset;
624
    extent->l2_cache_counts[min_index] = 1;
625
 found:
626
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
627
    cluster_offset = le32_to_cpu(l2_table[l2_index]);
628

    
629
    if (!cluster_offset) {
630
        if (!allocate)
631
            return 0;
632

    
633
        // Avoid the L2 tables update for the images that have snapshots.
634
        cluster_offset = bdrv_getlength(extent->file);
635
        bdrv_truncate(
636
            extent->file,
637
            cluster_offset + (extent->cluster_sectors << 9)
638
        );
639

    
640
        cluster_offset >>= 9;
641
        tmp = cpu_to_le32(cluster_offset);
642
        l2_table[l2_index] = tmp;
643

    
644
        /* First of all we write grain itself, to avoid race condition
645
         * that may to corrupt the image.
646
         * This problem may occur because of insufficient space on host disk
647
         * or inappropriate VM shutdown.
648
         */
649
        if (get_whole_cluster(
650
                bs, extent, cluster_offset, offset, allocate) == -1)
651
            return 0;
652

    
653
        if (m_data) {
654
            m_data->offset = tmp;
655
            m_data->l1_index = l1_index;
656
            m_data->l2_index = l2_index;
657
            m_data->l2_offset = l2_offset;
658
            m_data->valid = 1;
659
        }
660
    }
661
    cluster_offset <<= 9;
662
    return cluster_offset;
663
}
664

    
665
/*
 * Return the first extent whose end_sector lies beyond sector_num.
 *
 * The scan starts at start_hint, or at the first extent when start_hint
 * is NULL, and moves towards the end of the array.  Returns NULL when no
 * remaining extent contains the sector.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *cur = start_hint ? start_hint : &s->extents[0];

    for (; cur < &s->extents[s->num_extents]; cur++) {
        if (sector_num < cur->end_sector) {
            return cur;
        }
    }
    return NULL;
}
681

    
682
/*
 * Report whether the data at sector_num is present in this image.
 *
 * *pnum receives the number of sectors, at most nb_sectors, that share the
 * reported state: up to the end of the extent for a flat extent, up to
 * the end of the cluster otherwise.
 *
 * Returns 1 when allocated and 0 when not.  If no extent contains
 * sector_num, 0 is returned and *pnum is left untouched.
 */
static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = find_extent(s, sector_num, NULL);
    int64_t in_cluster, remaining, allocated;
    uint64_t cluster;

    if (extent == NULL) {
        return 0;
    }

    if (extent->flat) {
        /* flat extents are always fully present */
        remaining = extent->end_sector - sector_num;
        allocated = 1;
    } else {
        cluster = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
        in_cluster = sector_num % extent->cluster_sectors;
        remaining = extent->cluster_sectors - in_cluster;
        allocated = cluster ? 1 : 0;
    }

    if (remaining > nb_sectors) {
        remaining = nb_sectors;
    }
    *pnum = remaining;
    return allocated;
}
709

    
710
/*
 * Read nb_sectors sectors starting at sector_num into buf.
 *
 * The request is served one cluster at a time: mapped clusters are read
 * from bs->file, unmapped ones come from the backing image when there is
 * one (after the parent CID check) and are zero-filled otherwise.
 *
 * Returns 0 on success, -EIO when a sector lies outside every extent, and
 * -1 on a CID mismatch or a failed read.
 */
static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
                    uint8_t *buf, int nb_sectors)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    uint64_t cluster_offset;
    uint64_t chunk, in_cluster;

    while (nb_sectors > 0) {
        extent = find_extent(s, sector_num, extent);
        if (extent == NULL) {
            return -EIO;
        }

        cluster_offset = get_cluster_offset(
                            bs, extent, NULL, sector_num << 9, 0);

        /* never cross a cluster boundary within one iteration */
        in_cluster = sector_num % extent->cluster_sectors;
        chunk = extent->cluster_sectors - in_cluster;
        if (chunk > nb_sectors) {
            chunk = nb_sectors;
        }

        if (cluster_offset) {
            if (bdrv_pread(bs->file, cluster_offset + in_cluster * 512,
                           buf, chunk * 512) != chunk * 512) {
                return -1;
            }
        } else if (bs->backing_hd) {
            /* try to read from the parent image */
            if (!vmdk_is_cid_valid(bs)) {
                return -1;
            }
            if (bdrv_read(bs->backing_hd, sector_num, buf, chunk) < 0) {
                return -1;
            }
        } else {
            memset(buf, 0, 512 * chunk);
        }

        nb_sectors -= chunk;
        sector_num += chunk;
        buf += chunk * 512;
    }
    return 0;
}
751

    
752
/*
 * Write nb_sectors sectors from buf starting at sector_num.
 *
 * The request is served one cluster at a time.  Clusters are allocated on
 * demand; for a newly allocated cluster the data is written first and the
 * L2 tables are updated afterwards.  After the first successful chunk the
 * image CID is rewritten with the current time.
 *
 * Returns 0 on success, -EIO when a sector lies outside every extent, and
 * -1 when sector_num is beyond the image, no cluster can be obtained, or
 * a write fails.
 */
static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                     const uint8_t *buf, int nb_sectors)
{
    /* NOTE(review): function-level static, so the flag is shared by every
     * image this driver has open, not tracked per image. */
    static int cid_update = 0;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    VmdkMetaData m_data;
    uint64_t cluster_offset;
    int64_t in_cluster;
    int chunk;

    if (sector_num > bs->total_sectors) {
        fprintf(stderr,
                "(VMDK) Wrong offset: sector_num=0x%" PRIx64
                " total_sectors=0x%" PRIx64 "\n",
                sector_num, bs->total_sectors);
        return -1;
    }

    for (; nb_sectors > 0;
         nb_sectors -= chunk, sector_num += chunk, buf += chunk * 512) {
        extent = find_extent(s, sector_num, extent);
        if (extent == NULL) {
            return -EIO;
        }

        cluster_offset = get_cluster_offset(bs, extent, &m_data,
                                            sector_num << 9, 1);
        if (cluster_offset == 0) {
            return -1;
        }

        /* never cross a cluster boundary within one iteration */
        in_cluster = sector_num % extent->cluster_sectors;
        chunk = extent->cluster_sectors - in_cluster;
        if (chunk > nb_sectors) {
            chunk = nb_sectors;
        }

        if (bdrv_pwrite(bs->file, cluster_offset + in_cluster * 512,
                        buf, chunk * 512) != chunk * 512) {
            return -1;
        }

        /* a new cluster was allocated: publish it in the L2 tables */
        if (m_data.valid && vmdk_L2update(extent, &m_data) == -1) {
            return -1;
        }

        /* update the CID on the first write; the result is not checked */
        if (!cid_update) {
            vmdk_write_cid(bs, time(NULL));
            cid_update = 1;
        }
    }
    return 0;
}
814

    
815
/*
 * Write one grain directory.
 *
 * The directory starts at sector dir_sector and is gd_size sectors long.
 * Its gt_count entries point at grain tables laid out back to back right
 * after the directory, gt_size sectors each.  Entries are stored
 * little-endian, which is the byte order vmdk_open() converts from.
 *
 * Returns 0 on success or a negative errno value.
 */
static int vmdk_write_grain_dir(int fd, int64_t dir_sector, uint32_t gd_size,
                                uint32_t gt_size, uint32_t gt_count)
{
    uint32_t i, gt_sector, entry;

    if (lseek(fd, dir_sector << 9, SEEK_SET) == -1) {
        return -errno;
    }
    gt_sector = dir_sector + gd_size;
    for (i = 0; i < gt_count; i++, gt_sector += gt_size) {
        entry = cpu_to_le32(gt_sector);
        if (qemu_write_full(fd, &entry, sizeof(entry)) != sizeof(entry)) {
            return -errno;
        }
    }
    return 0;
}

/*
 * Create a new image.
 *
 * Recognized options: BLOCK_OPT_SIZE (size in bytes), BLOCK_OPT_BACKING_FILE
 * and BLOCK_OPT_COMPAT6.  With a backing file the work is delegated to
 * vmdk_snapshot_create().  Otherwise a monolithic sparse VMDK4 file is
 * written: magic, header, two grain directories and the text descriptor,
 * with the file extended up to the start of the grain area.
 *
 * Returns 0 on success or a negative errno value.
 */
static int vmdk_create(const char *filename, QEMUOptionParameter *options)
{
    int fd;
    VMDK4Header header;
    uint32_t magic, grains, gd_size, gt_size, gt_count;
    static const char desc_template[] =
        "# Disk DescriptorFile\n"
        "version=1\n"
        "CID=%x\n"
        "parentCID=ffffffff\n"
        "createType=\"monolithicSparse\"\n"
        "\n"
        "# Extent description\n"
        "RW %" PRId64 " SPARSE \"%s\"\n"
        "\n"
        "# The Disk Data Base \n"
        "#DDB\n"
        "\n"
        "ddb.virtualHWVersion = \"%d\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"16\"\n"
        "ddb.geometry.sectors = \"63\"\n"
        "ddb.adapterType = \"ide\"\n";
    char desc[1024];
    const char *real_filename, *temp_str;
    int64_t total_size = 0;
    const char *backing_file = NULL;
    int flags = 0;
    int ret;

    /* Read out options */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            total_size = options->value.n / 512;    /* bytes -> sectors */
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
            flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
        }
        options++;
    }

    /* images with a backing file are created by the snapshot path */
    if (backing_file) {
        return vmdk_snapshot_create(filename, backing_file);
    }

    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
              0644);
    if (fd < 0) {
        return -errno;
    }
    magic = cpu_to_be32(VMDK4_MAGIC);
    memset(&header, 0, sizeof(header));
    header.version = 1;
    header.flags = 3; /* ?? */
    header.capacity = total_size;
    header.granularity = 128;
    header.num_gtes_per_gte = 512;

    /* table sizes are in sectors, rounded up */
    grains = (total_size + header.granularity - 1) / header.granularity;
    gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
    gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;

    /* layout: header, descriptor, first directory + tables, second
     * directory + tables, then grains aligned to the granularity */
    header.desc_offset = 1;
    header.desc_size = 20;
    header.rgd_offset = header.desc_offset + header.desc_size;
    header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
    header.grain_offset =
       ((header.gd_offset + gd_size + (gt_size * gt_count) +
         header.granularity - 1) / header.granularity) *
        header.granularity;

    /* swap endianness for all header fields */
    header.version = cpu_to_le32(header.version);
    header.flags = cpu_to_le32(header.flags);
    header.capacity = cpu_to_le64(header.capacity);
    header.granularity = cpu_to_le64(header.granularity);
    header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
    header.desc_offset = cpu_to_le64(header.desc_offset);
    header.desc_size = cpu_to_le64(header.desc_size);
    header.rgd_offset = cpu_to_le64(header.rgd_offset);
    header.gd_offset = cpu_to_le64(header.gd_offset);
    header.grain_offset = cpu_to_le64(header.grain_offset);

    header.check_bytes[0] = 0xa;
    header.check_bytes[1] = 0x20;
    header.check_bytes[2] = 0xd;
    header.check_bytes[3] = 0xa;

    /* write all the data */
    ret = qemu_write_full(fd, &magic, sizeof(magic));
    if (ret != sizeof(magic)) {
        ret = -errno;
        goto exit;
    }
    ret = qemu_write_full(fd, &header, sizeof(header));
    if (ret != sizeof(header)) {
        ret = -errno;
        goto exit;
    }

    ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
    if (ret < 0) {
        ret = -errno;
        goto exit;
    }

    /* write grain directory */
    ret = vmdk_write_grain_dir(fd, le64_to_cpu(header.rgd_offset),
                               gd_size, gt_size, gt_count);
    if (ret < 0) {
        goto exit;
    }

    /* write backup grain directory */
    ret = vmdk_write_grain_dir(fd, le64_to_cpu(header.gd_offset),
                               gd_size, gt_size, gt_count);
    if (ret < 0) {
        goto exit;
    }

    /* compose the descriptor; only the last path component is recorded */
    real_filename = filename;
    if ((temp_str = strrchr(real_filename, '\\')) != NULL) {
        real_filename = temp_str + 1;
    }
    if ((temp_str = strrchr(real_filename, '/')) != NULL) {
        real_filename = temp_str + 1;
    }
    if ((temp_str = strrchr(real_filename, ':')) != NULL) {
        real_filename = temp_str + 1;
    }
    snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
             total_size, real_filename,
             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
             total_size / (int64_t)(63 * 16));

    /* write the descriptor */
    if (lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET) == -1) {
        ret = -errno;
        goto exit;
    }
    ret = qemu_write_full(fd, desc, strlen(desc));
    if (ret != strlen(desc)) {
        ret = -errno;
        goto exit;
    }

    ret = 0;
exit:
    close(fd);
    return ret;
}
971

    
972
/* Driver close callback: releases every extent and its tables. */
static void vmdk_close(BlockDriverState *bs)
{
    vmdk_free_extents(bs);
}
976

    
977
/* Driver flush callback: forwards to the underlying file and returns the
 * result of bdrv_flush() unchanged. */
static int vmdk_flush(BlockDriverState *bs)
{
    int ret = bdrv_flush(bs->file);

    return ret;
}
981

    
982

    
983
static QEMUOptionParameter vmdk_create_options[] = {
984
    {
985
        .name = BLOCK_OPT_SIZE,
986
        .type = OPT_SIZE,
987
        .help = "Virtual disk size"
988
    },
989
    {
990
        .name = BLOCK_OPT_BACKING_FILE,
991
        .type = OPT_STRING,
992
        .help = "File name of a base image"
993
    },
994
    {
995
        .name = BLOCK_OPT_COMPAT6,
996
        .type = OPT_FLAG,
997
        .help = "VMDK version 6 image"
998
    },
999
    { NULL }
1000
};
1001

    
1002
static BlockDriver bdrv_vmdk = {
1003
    .format_name        = "vmdk",
1004
    .instance_size        = sizeof(BDRVVmdkState),
1005
    .bdrv_probe                = vmdk_probe,
1006
    .bdrv_open      = vmdk_open,
1007
    .bdrv_read                = vmdk_read,
1008
    .bdrv_write                = vmdk_write,
1009
    .bdrv_close                = vmdk_close,
1010
    .bdrv_create        = vmdk_create,
1011
    .bdrv_flush                = vmdk_flush,
1012
    .bdrv_is_allocated        = vmdk_is_allocated,
1013

    
1014
    .create_options = vmdk_create_options,
1015
};
1016

    
1017
static void bdrv_vmdk_init(void)
1018
{
1019
    bdrv_register(&bdrv_vmdk);
1020
}
1021

    
1022
block_init(bdrv_vmdk_init);