/*
 * Block driver for the VMDK format
 *
 * Copyright (c) 2004 Fabrice Bellard
 * Copyright (c) 2005 Filip Navara
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu-common.h"
#include "block_int.h"
#include "module.h"

#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;
    uint32_t granularity;
    uint32_t l1dir_offset;
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
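
/*
 * On-disk header of a VMDK4 sparse extent, stored right after the 4-byte
 * magic. As used by the code below, offsets and sizes are expressed in
 * 512-byte sectors: desc_offset/desc_size locate the embedded text
 * descriptor, rgd_offset and gd_offset locate the redundant and primary
 * grain directories, grain_offset is where the data grains begin,
 * granularity is the grain size and num_gtes_per_gte is the number of
 * entries in each grain table. For the images produced by vmdk_create()
 * (granularity 128, 512 entries per grain table) a grain is 64KB and one
 * grain table covers 32MB of the virtual disk.
 */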
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;
    int64_t granularity;
    int64_t desc_offset;
    int64_t desc_size;
    int32_t num_gtes_per_gte;
    int64_t rgd_offset;
    int64_t gd_offset;
    int64_t grain_offset;
    char filler[1];
    char check_bytes[4];
} __attribute__((packed)) VMDK4Header;

#define L2_CACHE_SIZE 16
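
/*
 * A VmdkExtent describes one contiguous range of the virtual disk. For a
 * sparse extent the grain directory acts as the L1 table and each grain
 * table as an L2 table: one L1 entry covers l1_entry_sectors
 * (= l2_size * cluster_sectors) sectors, and each L2 entry gives the file
 * offset of one grain (cluster). A small per-extent cache of L2 tables
 * avoids re-reading them from disk on every access.
 */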
typedef struct VmdkExtent {
    BlockDriverState *file;
    bool flat;
    int64_t sectors;
    int64_t end_sector;
    int64_t l1_table_offset;
    int64_t l1_backup_table_offset;
    uint32_t *l1_table;
    uint32_t *l1_backup_table;
    unsigned int l1_size;
    uint32_t l1_entry_sectors;

    unsigned int l2_size;
    uint32_t *l2_cache;
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

    unsigned int cluster_sectors;
} VmdkExtent;

typedef struct BDRVVmdkState {
    uint32_t parent_cid;
    int num_extents;
    /* Extent array with num_extents entries, sorted in ascending order by address */
    VmdkExtent *extents;
} BDRVVmdkState;

typedef struct VmdkMetaData {
    uint32_t offset;
    unsigned int l1_index;
    unsigned int l2_index;
    unsigned int l2_offset;
    int valid;
} VmdkMetaData;

static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    uint32_t magic;

    if (buf_size < 4)
        return 0;
    magic = be32_to_cpu(*(uint32_t *)buf);
    if (magic == VMDK3_MAGIC ||
        magic == VMDK4_MAGIC)
        return 100;
    else
        return 0;
}

#define CHECK_CID 1

#define SECTOR_SIZE 512
#define DESC_SIZE 20*SECTOR_SIZE        // 20 sectors of 512 bytes each
#define HEADER_SIZE 512                 // first sector of 512 bytes

static void vmdk_free_extents(BlockDriverState *bs)
{
    int i;
    BDRVVmdkState *s = bs->opaque;

    for (i = 0; i < s->num_extents; i++) {
        qemu_free(s->extents[i].l1_table);
        qemu_free(s->extents[i].l2_cache);
        qemu_free(s->extents[i].l1_backup_table);
    }
    qemu_free(s->extents);
}
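
/*
 * CID handling: the embedded descriptor stores a content ID ("CID") for the
 * image and the CID of its parent ("parentCID"). The parent's current CID
 * is compared against the recorded parentCID before reads are delegated to
 * the backing file, so a parent image that was modified behind our back is
 * detected.
 */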
static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
{
    char desc[DESC_SIZE];
    uint32_t cid;
    const char *p_name, *cid_str;
    size_t cid_str_size;

    /* the descriptor offset = 0x200 */
    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
        return 0;

    if (parent) {
        cid_str = "parentCID";
        cid_str_size = sizeof("parentCID");
    } else {
        cid_str = "CID";
        cid_str_size = sizeof("CID");
    }

    if ((p_name = strstr(desc,cid_str)) != NULL) {
        p_name += cid_str_size;
        sscanf(p_name,"%x",&cid);
    }

    return cid;
}

static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
    char *p_name, *tmp_str;

    /* the descriptor offset = 0x200 */
    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
        return -1;

    tmp_str = strstr(desc,"parentCID");
    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
    if ((p_name = strstr(desc,"CID")) != NULL) {
        p_name += sizeof("CID");
        snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
        pstrcat(desc, sizeof(desc), tmp_desc);
    }

    if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0)
        return -1;
    return 0;
}

static int vmdk_is_cid_valid(BlockDriverState *bs)
{
#ifdef CHECK_CID
    BDRVVmdkState *s = bs->opaque;
    BlockDriverState *p_bs = bs->backing_hd;
    uint32_t cur_pcid;

    if (p_bs) {
        cur_pcid = vmdk_read_cid(p_bs,0);
        if (s->parent_cid != cur_pcid)
            // CID not valid
            return 0;
    }
#endif
    // CID valid
    return 1;
}
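
/*
 * Create a new sparse VMDK image that uses backing_file as its parent: the
 * parent's header and grain directories are copied into the new file, and a
 * fresh descriptor that records the parent's CID and references it via
 * parentFileNameHint is written at sector 1.
 */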
static int vmdk_snapshot_create(const char *filename, const char *backing_file)
{
    int snp_fd, p_fd;
    int ret;
    uint32_t p_cid;
    char *p_name, *gd_buf, *rgd_buf;
    const char *real_filename, *temp_str;
    VMDK4Header header;
    uint32_t gde_entries, gd_size;
    int64_t gd_offset, rgd_offset, capacity, gt_size;
    char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
    static const char desc_template[] =
    "# Disk DescriptorFile\n"
    "version=1\n"
    "CID=%x\n"
    "parentCID=%x\n"
    "createType=\"monolithicSparse\"\n"
    "parentFileNameHint=\"%s\"\n"
    "\n"
    "# Extent description\n"
    "RW %u SPARSE \"%s\"\n"
    "\n"
    "# The Disk Data Base \n"
    "#DDB\n"
    "\n";

    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
    if (snp_fd < 0)
        return -errno;
    p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
    if (p_fd < 0) {
        close(snp_fd);
        return -errno;
    }

    /* read the header */
    if (lseek(p_fd, 0x0, SEEK_SET) == -1) {
        ret = -errno;
        goto fail;
    }
    if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE) {
        ret = -errno;
        goto fail;
    }

    /* write the header */
    if (lseek(snp_fd, 0x0, SEEK_SET) == -1) {
        ret = -errno;
        goto fail;
    }
    if (write(snp_fd, hdr, HEADER_SIZE) == -1) {
        ret = -errno;
        goto fail;
    }

    memset(&header, 0, sizeof(header));
    memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC

    if (ftruncate(snp_fd, header.grain_offset << 9)) {
        ret = -errno;
        goto fail;
    }
    /* the descriptor offset = 0x200 */
    if (lseek(p_fd, 0x200, SEEK_SET) == -1) {
        ret = -errno;
        goto fail;
    }
    if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE) {
        ret = -errno;
        goto fail;
    }

    if ((p_name = strstr(p_desc,"CID")) != NULL) {
        p_name += sizeof("CID");
        sscanf(p_name,"%x",&p_cid);
    }

    real_filename = filename;
    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
        real_filename = temp_str + 1;
    if ((temp_str = strrchr(real_filename, '/')) != NULL)
        real_filename = temp_str + 1;
    if ((temp_str = strrchr(real_filename, ':')) != NULL)
        real_filename = temp_str + 1;

    snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
             (uint32_t)header.capacity, real_filename);

    /* write the descriptor */
    if (lseek(snp_fd, 0x200, SEEK_SET) == -1) {
        ret = -errno;
        goto fail;
    }
    if (write(snp_fd, s_desc, strlen(s_desc)) == -1) {
        ret = -errno;
        goto fail;
    }

    gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
    rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
    capacity = header.capacity * SECTOR_SIZE;       // Extent size
    /*
     * Each GDE spans 32MB of disk: there are 512 GTEs per GT and each GTE
     * points to one grain.
     */
    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
    if (!gt_size) {
        ret = -EINVAL;
        goto fail;
    }
    gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
    gd_size = gde_entries * sizeof(uint32_t);

    /* write RGD */
    rgd_buf = qemu_malloc(gd_size);
    if (lseek(p_fd, rgd_offset, SEEK_SET) == -1) {
        ret = -errno;
        goto fail_rgd;
    }
    if (read(p_fd, rgd_buf, gd_size) != gd_size) {
        ret = -errno;
        goto fail_rgd;
    }
    if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1) {
        ret = -errno;
        goto fail_rgd;
    }
    if (write(snp_fd, rgd_buf, gd_size) == -1) {
        ret = -errno;
        goto fail_rgd;
    }

    /* write GD */
    gd_buf = qemu_malloc(gd_size);
    if (lseek(p_fd, gd_offset, SEEK_SET) == -1) {
        ret = -errno;
        goto fail_gd;
    }
    if (read(p_fd, gd_buf, gd_size) != gd_size) {
        ret = -errno;
        goto fail_gd;
    }
    if (lseek(snp_fd, gd_offset, SEEK_SET) == -1) {
        ret = -errno;
        goto fail_gd;
    }
    if (write(snp_fd, gd_buf, gd_size) == -1) {
        ret = -errno;
        goto fail_gd;
    }
    ret = 0;

fail_gd:
    qemu_free(gd_buf);
fail_rgd:
    qemu_free(rgd_buf);
fail:
    close(p_fd);
    close(snp_fd);
    return ret;
}

static int vmdk_parent_open(BlockDriverState *bs)
{
    char *p_name;
    char desc[DESC_SIZE];

    /* the descriptor offset = 0x200 */
    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
        return -1;

    if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
        char *end_name;

        p_name += sizeof("parentFileNameHint") + 1;
        if ((end_name = strchr(p_name,'\"')) == NULL)
            return -1;
        if ((end_name - p_name) > sizeof (bs->backing_file) - 1)
            return -1;

        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

    return 0;
}

/* Create and append an extent to the extent array. Returns the added
 * VmdkExtent address, or NULL if allocation failed. */
static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
                           BlockDriverState *file, bool flat, int64_t sectors,
                           int64_t l1_offset, int64_t l1_backup_offset,
                           uint32_t l1_size,
                           int l2_size, unsigned int cluster_sectors)
{
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;

    s->extents = qemu_realloc(s->extents,
                              (s->num_extents + 1) * sizeof(VmdkExtent));
    extent = &s->extents[s->num_extents];
    s->num_extents++;

    memset(extent, 0, sizeof(VmdkExtent));
    extent->file = file;
    extent->flat = flat;
    extent->sectors = sectors;
    extent->l1_table_offset = l1_offset;
    extent->l1_backup_table_offset = l1_backup_offset;
    extent->l1_size = l1_size;
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
    extent->cluster_sectors = cluster_sectors;

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
    } else {
        extent->end_sector = extent->sectors;
    }
    bs->total_sectors = extent->end_sector;
    return extent;
}
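
/*
 * Open an image file: probe the magic, read the VMDK3/VMDK4 header, register
 * a single sparse extent for it, then load the extent's L1 table (plus the
 * backup L1 table, if one is present) and allocate its L2 cache. For VMDK4
 * images the embedded descriptor is also parsed for a parent file name, and
 * the recorded parent CID is remembered for later validation.
 */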
static int vmdk_open(BlockDriverState *bs, int flags)
{
    BDRVVmdkState *s = bs->opaque;
    uint32_t magic;
    int i;
    uint32_t l1_size, l1_entry_sectors;
    VmdkExtent *extent = NULL;

    if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
        goto fail;

    magic = be32_to_cpu(magic);
    if (magic == VMDK3_MAGIC) {
        VMDK3Header header;
        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
                != sizeof(header)) {
            goto fail;
        }
        extent = vmdk_add_extent(bs, bs->file, false,
                              le32_to_cpu(header.disk_sectors),
                              le32_to_cpu(header.l1dir_offset) << 9, 0,
                              1 << 6, 1 << 9, le32_to_cpu(header.granularity));
    } else if (magic == VMDK4_MAGIC) {
        VMDK4Header header;
        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
                != sizeof(header)) {
            goto fail;
        }
        l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
                            * le64_to_cpu(header.granularity);
        l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                    / l1_entry_sectors;
        extent = vmdk_add_extent(bs, bs->file, false,
                              le64_to_cpu(header.capacity),
                              le64_to_cpu(header.gd_offset) << 9,
                              le64_to_cpu(header.rgd_offset) << 9,
                              l1_size,
                              le32_to_cpu(header.num_gtes_per_gte),
                              le64_to_cpu(header.granularity));
        if (extent->l1_entry_sectors <= 0) {
            goto fail;
        }
        // try to open the parent image, if one exists
        if (vmdk_parent_open(bs) != 0)
            goto fail;
        // remember the parent CID so the backing file can be validated later
        s->parent_cid = vmdk_read_cid(bs,1);
    } else {
        goto fail;
    }

    /* read the L1 table */
    l1_size = extent->l1_size * sizeof(uint32_t);
    extent->l1_table = qemu_malloc(l1_size);
    if (bdrv_pread(bs->file,
            extent->l1_table_offset,
            extent->l1_table,
            l1_size)
        != l1_size) {
        goto fail;
    }
    for (i = 0; i < extent->l1_size; i++) {
        le32_to_cpus(&extent->l1_table[i]);
    }

    if (extent->l1_backup_table_offset) {
        extent->l1_backup_table = qemu_malloc(l1_size);
        if (bdrv_pread(bs->file,
                    extent->l1_backup_table_offset,
                    extent->l1_backup_table,
                    l1_size)
                != l1_size) {
            goto fail;
        }
        for (i = 0; i < extent->l1_size; i++) {
            le32_to_cpus(&extent->l1_backup_table[i]);
        }
    }

    extent->l2_cache =
        qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
    return 0;
 fail:
    vmdk_free_extents(bs);
    return -1;
}

static int get_whole_cluster(BlockDriverState *bs,
                VmdkExtent *extent,
                uint64_t cluster_offset,
                uint64_t offset,
                bool allocate)
{
    /* 128 sectors * 512 bytes each = grain size 64KB */
    uint8_t  whole_grain[extent->cluster_sectors * 512];

    // We get here on the first write to a not-yet-allocated grain (cluster):
    // read the whole grain from the parent image, if there is one.
    if (bs->backing_hd) {
        int ret;

        if (!vmdk_is_cid_valid(bs))
            return -1;

        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
                extent->cluster_sectors);
        if (ret < 0) {
            return -1;
        }

        // Write the grain only into the active image
        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
                extent->cluster_sectors);
        if (ret < 0) {
            return -1;
        }
    }
    return 0;
}

static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
{
    /* update L2 table */
    if (bdrv_pwrite_sync(
                extent->file,
                ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(m_data->offset)),
                &(m_data->offset),
                sizeof(m_data->offset)
            ) < 0) {
        return -1;
    }
    /* update backup L2 table */
    if (extent->l1_backup_table_offset != 0) {
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
        if (bdrv_pwrite_sync(
                    extent->file,
                    ((int64_t)m_data->l2_offset * 512)
                        + (m_data->l2_index * sizeof(m_data->offset)),
                    &(m_data->offset), sizeof(m_data->offset)
                ) < 0) {
            return -1;
        }
    }

    return 0;
}
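
/*
 * Translate a guest offset into an offset in the extent file. The L1 (grain
 * directory) entry selects an L2 (grain table), which is looked up through
 * the per-extent cache; the L2 entry then yields the grain's sector offset.
 * When 'allocate' is set and the grain is missing, space is appended to the
 * extent file, the grain is populated via get_whole_cluster(), and the new
 * mapping is returned in m_data for the caller to commit with
 * vmdk_L2update(). For the default created images (cluster_sectors = 128,
 * l2_size = 512) each L1 entry therefore covers 65536 sectors, i.e. 32MB.
 */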
static uint64_t get_cluster_offset(BlockDriverState *bs,
                                    VmdkExtent *extent,
                                    VmdkMetaData *m_data,
                                    uint64_t offset, int allocate)
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
    uint32_t min_count, *l2_table, tmp = 0;
    uint64_t cluster_offset;

    if (m_data)
        m_data->valid = 0;

    l1_index = (offset >> 9) / extent->l1_entry_sectors;
    if (l1_index >= extent->l1_size) {
        return 0;
    }
    l2_offset = extent->l1_table[l1_index];
    if (!l2_offset) {
        return 0;
    }
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == extent->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
                for(j = 0; j < L2_CACHE_SIZE; j++) {
                    extent->l2_cache_counts[j] >>= 1;
                }
            }
            l2_table = extent->l2_cache + (i * extent->l2_size);
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (extent->l2_cache_counts[i] < min_count) {
            min_count = extent->l2_cache_counts[i];
            min_index = i;
        }
    }
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
    if (bdrv_pread(
                extent->file,
                (int64_t)l2_offset * 512,
                l2_table,
                extent->l2_size * sizeof(uint32_t)
            ) != extent->l2_size * sizeof(uint32_t)) {
        return 0;
    }

    extent->l2_cache_offsets[min_index] = l2_offset;
    extent->l2_cache_counts[min_index] = 1;
 found:
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
    cluster_offset = le32_to_cpu(l2_table[l2_index]);

    if (!cluster_offset) {
        if (!allocate)
            return 0;

        // Avoid the L2 tables update for the images that have snapshots.
        cluster_offset = bdrv_getlength(extent->file);
        bdrv_truncate(
            extent->file,
            cluster_offset + (extent->cluster_sectors << 9)
        );

        cluster_offset >>= 9;
        tmp = cpu_to_le32(cluster_offset);
        l2_table[l2_index] = tmp;

        /* First of all we write the grain itself, to avoid a race condition
         * that may corrupt the image.
         * This problem may occur because of insufficient space on the host
         * disk or an inappropriate VM shutdown.
         */
        if (get_whole_cluster(
                bs, extent, cluster_offset, offset, allocate) == -1)
            return 0;

        if (m_data) {
            m_data->offset = tmp;
            m_data->l1_index = l1_index;
            m_data->l2_index = l2_index;
            m_data->l2_offset = l2_offset;
            m_data->valid = 1;
        }
    }
    cluster_offset <<= 9;
    return cluster_offset;
}

static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *extent = start_hint;

    if (!extent) {
        extent = &s->extents[0];
    }
    while (extent < &s->extents[s->num_extents]) {
        if (sector_num < extent->end_sector) {
            return extent;
        }
        extent++;
    }
    return NULL;
}

static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
    BDRVVmdkState *s = bs->opaque;

    int64_t index_in_cluster, n, ret;
    uint64_t offset;
    VmdkExtent *extent;

    extent = find_extent(s, sector_num, NULL);
    if (!extent) {
        return 0;
    }
    if (extent->flat) {
        n = extent->end_sector - sector_num;
        ret = 1;
    } else {
        offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
        index_in_cluster = sector_num % extent->cluster_sectors;
        n = extent->cluster_sectors - index_in_cluster;
        ret = offset ? 1 : 0;
    }
    if (n > nb_sectors)
        n = nb_sectors;
    *pnum = n;
    return ret;
}
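
/*
 * Read path: walk the covering extents grain by grain. Allocated grains are
 * read from the extent file; unallocated grains fall back to the backing
 * file (after the CID check) or are zero-filled when there is no parent.
 */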
static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
                    uint8_t *buf, int nb_sectors)
{
    BDRVVmdkState *s = bs->opaque;
    int ret;
    uint64_t n, index_in_cluster;
    VmdkExtent *extent = NULL;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        extent = find_extent(s, sector_num, extent);
        if (!extent) {
            return -EIO;
        }
        cluster_offset = get_cluster_offset(
                            bs, extent, NULL, sector_num << 9, 0);
        index_in_cluster = sector_num % extent->cluster_sectors;
        n = extent->cluster_sectors - index_in_cluster;
        if (n > nb_sectors)
            n = nb_sectors;
        if (!cluster_offset) {
            // not allocated here: read from the parent image, if one exists
            if (bs->backing_hd) {
                if (!vmdk_is_cid_valid(bs))
                    return -1;
                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
                if (ret < 0)
                    return -1;
            } else {
                memset(buf, 0, 512 * n);
            }
        } else {
            if (bdrv_pread(bs->file,
                           cluster_offset + index_in_cluster * 512,
                           buf, n * 512) != n * 512)
                return -1;
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
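
/*
 * Write path: allocate missing grains through get_cluster_offset() (which
 * also copies the old contents in from the backing file), write the data,
 * then commit the new L2 mapping with vmdk_L2update(). The descriptor CID
 * is refreshed on the first write after the image is opened.
 */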
static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                     const uint8_t *buf, int nb_sectors)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int n;
    int64_t index_in_cluster;
    uint64_t cluster_offset;
    static int cid_update = 0;
    VmdkMetaData m_data;

    if (sector_num > bs->total_sectors) {
        fprintf(stderr,
                "(VMDK) Wrong offset: sector_num=0x%" PRIx64
                " total_sectors=0x%" PRIx64 "\n",
                sector_num, bs->total_sectors);
        return -1;
    }

    while (nb_sectors > 0) {
        extent = find_extent(s, sector_num, extent);
        if (!extent) {
            return -EIO;
        }
        cluster_offset = get_cluster_offset(
                                bs,
                                extent,
                                &m_data,
                                sector_num << 9, 1);
        if (!cluster_offset) {
            return -1;
        }
        index_in_cluster = sector_num % extent->cluster_sectors;
        n = extent->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }

        if (bdrv_pwrite(bs->file,
                        cluster_offset + index_in_cluster * 512,
                        buf, n * 512)
                != n * 512) {
            return -1;
        }
        if (m_data.valid) {
            /* update L2 tables */
            if (vmdk_L2update(extent, &m_data) == -1) {
                return -1;
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;

        // update CID on the first write every time the virtual disk is opened
        if (!cid_update) {
            vmdk_write_cid(bs, time(NULL));
            cid_update++;
        }
    }
    return 0;
}
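
/*
 * Create a new monolithic sparse VMDK4 image: write the magic and header,
 * reserve space up to grain_offset, pre-populate the redundant and primary
 * grain directories with the offsets of their (still empty) grain tables,
 * and finally write the text descriptor. If a backing file is requested,
 * the work is delegated to vmdk_snapshot_create() instead.
 */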
static int vmdk_create(const char *filename, QEMUOptionParameter *options)
{
    int fd, i;
    VMDK4Header header;
    uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
    static const char desc_template[] =
        "# Disk DescriptorFile\n"
        "version=1\n"
        "CID=%x\n"
        "parentCID=ffffffff\n"
        "createType=\"monolithicSparse\"\n"
        "\n"
        "# Extent description\n"
        "RW %" PRId64 " SPARSE \"%s\"\n"
        "\n"
        "# The Disk Data Base \n"
        "#DDB\n"
        "\n"
        "ddb.virtualHWVersion = \"%d\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"16\"\n"
        "ddb.geometry.sectors = \"63\"\n"
        "ddb.adapterType = \"ide\"\n";
    char desc[1024];
    const char *real_filename, *temp_str;
    int64_t total_size = 0;
    const char *backing_file = NULL;
    int flags = 0;
    int ret;

    // Read out options
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            total_size = options->value.n / 512;
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
            flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
        }
        options++;
    }

    /* XXX: add support for backing file */
    if (backing_file) {
        return vmdk_snapshot_create(filename, backing_file);
    }

    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
              0644);
    if (fd < 0)
        return -errno;
    magic = cpu_to_be32(VMDK4_MAGIC);
    memset(&header, 0, sizeof(header));
    header.version = 1;
    header.flags = 3; /* ?? */
    header.capacity = total_size;
    header.granularity = 128;
    header.num_gtes_per_gte = 512;

    grains = (total_size + header.granularity - 1) / header.granularity;
    gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
    gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;

    header.desc_offset = 1;
    header.desc_size = 20;
    header.rgd_offset = header.desc_offset + header.desc_size;
    header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
    header.grain_offset =
       ((header.gd_offset + gd_size + (gt_size * gt_count) +
         header.granularity - 1) / header.granularity) *
        header.granularity;

    /* swap endianness for all header fields */
    header.version = cpu_to_le32(header.version);
    header.flags = cpu_to_le32(header.flags);
    header.capacity = cpu_to_le64(header.capacity);
    header.granularity = cpu_to_le64(header.granularity);
    header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
    header.desc_offset = cpu_to_le64(header.desc_offset);
    header.desc_size = cpu_to_le64(header.desc_size);
    header.rgd_offset = cpu_to_le64(header.rgd_offset);
    header.gd_offset = cpu_to_le64(header.gd_offset);
    header.grain_offset = cpu_to_le64(header.grain_offset);

    header.check_bytes[0] = 0xa;
    header.check_bytes[1] = 0x20;
    header.check_bytes[2] = 0xd;
    header.check_bytes[3] = 0xa;

    /* write all the data */
    ret = qemu_write_full(fd, &magic, sizeof(magic));
    if (ret != sizeof(magic)) {
        ret = -errno;
        goto exit;
    }
    ret = qemu_write_full(fd, &header, sizeof(header));
    if (ret != sizeof(header)) {
        ret = -errno;
        goto exit;
    }

    ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
    if (ret < 0) {
        ret = -errno;
        goto exit;
    }

    /* write grain directory */
    lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
         i < gt_count; i++, tmp += gt_size) {
        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
        if (ret != sizeof(tmp)) {
            ret = -errno;
            goto exit;
        }
    }

    /* write backup grain directory */
    lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
         i < gt_count; i++, tmp += gt_size) {
        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
        if (ret != sizeof(tmp)) {
            ret = -errno;
            goto exit;
        }
    }

    /* compose the descriptor */
    real_filename = filename;
    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
        real_filename = temp_str + 1;
    if ((temp_str = strrchr(real_filename, '/')) != NULL)
        real_filename = temp_str + 1;
    if ((temp_str = strrchr(real_filename, ':')) != NULL)
        real_filename = temp_str + 1;
    snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
             total_size, real_filename,
             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
             total_size / (int64_t)(63 * 16));

    /* write the descriptor */
    lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET);
    ret = qemu_write_full(fd, desc, strlen(desc));
    if (ret != strlen(desc)) {
        ret = -errno;
        goto exit;
    }

    ret = 0;
exit:
    close(fd);
    return ret;
}

static void vmdk_close(BlockDriverState *bs)
{
    vmdk_free_extents(bs);
}

static int vmdk_flush(BlockDriverState *bs)
{
    return bdrv_flush(bs->file);
}

static QEMUOptionParameter vmdk_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    {
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    },
    {
        .name = BLOCK_OPT_COMPAT6,
        .type = OPT_FLAG,
        .help = "VMDK version 6 image"
    },
    { NULL }
};

static BlockDriver bdrv_vmdk = {
    .format_name       = "vmdk",
    .instance_size     = sizeof(BDRVVmdkState),
    .bdrv_probe        = vmdk_probe,
    .bdrv_open         = vmdk_open,
    .bdrv_read         = vmdk_read,
    .bdrv_write        = vmdk_write,
    .bdrv_close        = vmdk_close,
    .bdrv_create       = vmdk_create,
    .bdrv_flush        = vmdk_flush,
    .bdrv_is_allocated = vmdk_is_allocated,

    .create_options    = vmdk_create_options,
};

static void bdrv_vmdk_init(void)
{
    bdrv_register(&bdrv_vmdk);
}

block_init(bdrv_vmdk_init);