Statistics
| Branch: | Revision:

root / drivers / block-vhd.c @ abdb293f

History | View | Annotate | Download (56.5 kB)

1
/*
2
 *
3
 * Copyright (c) 2007, XenSource Inc.
4
 * Copyright (c) 2010, Citrix Systems, Inc.
5
 *
6
 * All rights reserved.
7
 *
8
 * Redistribution and use in source and binary forms, with or without
9
 * modification, are permitted provided that the following conditions are met:
10
 *     * Redistributions of source code must retain the above copyright
11
 *       notice, this list of conditions and the following disclaimer.
12
 *     * Redistributions in binary form must reproduce the above copyright
13
 *       notice, this list of conditions and the following disclaimer in the
14
 *       documentation and/or other materials provided with the distribution.
15
 *     * Neither the name of XenSource Inc. nor the names of its contributors
16
 *       may be used to endorse or promote products derived from this software
17
 *       without specific prior written permission.
18
 *
19
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
23
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
 */
31

    
32
/*
 * block-vhd.c: asynchronous vhd implementation.
 *
 * A note on write transactions:
 * Writes that require updating the BAT or bitmaps cannot be signalled
 * as complete until all updates have reached disk.  Transactions are
 * used to ensure proper ordering in these cases.  The two types of
 * transactions are as follows:
 *   - Bitmap updates only: data writes that require updates to the same
 *     bitmap are grouped in a transaction.  Only after all data writes
 *     in a transaction complete does the bitmap write commence.  Only
 *     after the bitmap write finishes are the data writes signalled as
 *     complete.
 *   - BAT and bitmap updates: data writes are grouped in transactions
 *     as above, but a special extra write is included in the transaction,
 *     which zeros out the newly allocated bitmap on disk.  When the data
 *     writes and the zero-bitmap write complete, the BAT and bitmap writes
 *     are started in parallel.  The transaction is completed only after both
 *     the BAT and bitmap writes successfully return.
 */
52

    
53
#ifdef HAVE_CONFIG_H
54
#include "config.h"
55
#endif
56

    
57
#include <errno.h>
58
#include <fcntl.h>
59
#include <stdio.h>
60
#include <stdlib.h>
61
#include <unistd.h>
62
#include <sys/stat.h>
63
#include <sys/ioctl.h>
64
#include <uuid/uuid.h> /* For whatever reason, Linux packages this in */
65
                       /* e2fsprogs-devel.                            */
66
#include <string.h>    /* for memset.                                 */
67
#include <libaio.h>
68
#include <sys/mman.h>
69

    
70
#include "libvhd.h"
71
#include "tapdisk.h"
72
#include "tapdisk-driver.h"
73
#include "tapdisk-interface.h"
74
#include "tapdisk-disktype.h"
75
#include "tapdisk-storage.h"
76

    
77
/* sectors per block of the image most recently opened by __vhd_open() */
unsigned int SPB;

/* selects the DBG/ERR/TRACE flavour: 1 = DPRINTF, 2 = tlog, else disabled */
#define DEBUGGING   2
/* 1 enables ASSERT(); any other value compiles it away */
#define ASSERTING   1
/* NOTE(review): no user of MICROSOFT_COMPAT is visible here - confirm */
#define MICROSOFT_COMPAT

/* upper bound on batmap read attempts in vhd_initialize_bat() */
#define VHD_BATMAP_MAX_RETRIES 10
84

    
85
/*
 * __TRACE: log the request counters of state 's' (queued, completed,
 * returned), the number of data requests currently taken from the free
 * pool, and the block number of the pending BAT write.
 */
#define __TRACE(s)                                                     \
        do {                                                           \
                DBG(TLOG_DBG,                                          \
                    "%s: QUEUED: %" PRIu64 ", COMPLETED: %" PRIu64     \
                    ", RETURNED: %" PRIu64 ", DATA_ALLOCATED: "        \
                    "%u, BBLK: 0x%04x\n",                              \
                    s->vhd.file, s->queued, s->completed,              \
                    s->returned,                                       \
                    VHD_REQS_DATA - s->vreq_free_count,                \
                    s->bat.pbw_blk);                                   \
        } while(0)
94

    
95
/*
 * __ASSERT: if '_p' is false, report the failed expression through both
 * DPRINTF and the tlog, then call td_panic().
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and cannot capture an 'else' that follows an unbraced 'if (x) ASSERT(y);'.
 */
#define __ASSERT(_p)                                                   \
        do {                                                           \
                if (!(_p)) {                                           \
                        DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n",     \
                                __FILE__, __LINE__, #_p);              \
                        DBG(TLOG_WARN,                                 \
                            "%s:%d: FAILED ASSERTION: '%s'\n",         \
                            __FILE__, __LINE__, #_p);                  \
                        td_panic();                                    \
                }                                                      \
        } while (0)
103

    
104
/*
 * Logging front-ends, chosen by DEBUGGING:
 *   1     - DBG and ERR print through DPRINTF, TRACE does nothing
 *   2     - DBG writes to the tlog, ERR reports a driver error, TRACE
 *           dumps the request counters via __TRACE
 *   other - all three expand to a no-op expression
 */
#if (DEBUGGING == 1)
  #define DBG(_lvl, _f, _a...)       DPRINTF(_f, ##_a)
  #define ERR(_s, _err, _f, _a...)   DPRINTF("ERROR: %d: " _f, _err, ##_a)
  #define TRACE(_s)                  ((void)0)
#elif (DEBUGGING == 2)
  #define DBG(_lvl, _f, _a...)       tlog_write(_lvl, _f, ##_a)
  #define ERR(_s, _err, _f, _a...)   tlog_drv_error((_s)->driver, _err, _f, ##_a)
  #define TRACE(_s)                  __TRACE(_s)
#else
  #define DBG(_lvl, _f, _a...)       ((void)0)
  #define ERR(_s, _err, _f, _a...)   ((void)0)
  #define TRACE(_s)                  ((void)0)
#endif

/* ASSERT is live only when ASSERTING is 1 */
#if (ASSERTING == 1)
  #define ASSERT(_p)                 __ASSERT(_p)
#else
  #define ASSERT(_p)                 ((void)0)
#endif
123

    
124
/******VHD DEFINES******/

/* number of entries in the bitmap cache arrays of struct vhd_state */
#define VHD_CACHE_SIZE               32

/*
 * Request pool sizes.  VHD_REQS_DATA sizes vhd_state.vreq_list.
 * NOTE(review): the "+ 2" in VHD_REQS_META appears to cover the two
 * requests embedded in struct vhd_bat_state - confirm.
 */
#define VHD_REQS_DATA                TAPDISK_DATA_REQUESTS
#define VHD_REQS_META                (VHD_CACHE_SIZE + 2)
#define VHD_REQS_TOTAL               (VHD_REQS_DATA + VHD_REQS_META)

/* operation codes (NOTE(review): presumably stored in vhd_request.op) */
#define VHD_OP_BAT_WRITE             0
#define VHD_OP_DATA_READ             1
#define VHD_OP_DATA_WRITE            2
#define VHD_OP_BITMAP_READ           3
#define VHD_OP_BITMAP_WRITE          4
#define VHD_OP_ZERO_BM_WRITE         5
#define VHD_OP_REDUNDANT_BM_WRITE    6

/* NOTE(review): users of the VHD_BM_* codes are outside this view */
#define VHD_BM_BAT_LOCKED            0
#define VHD_BM_BAT_CLEAR             1
#define VHD_BM_BIT_CLEAR             2
#define VHD_BM_BIT_SET               3
#define VHD_BM_NOT_CACHED            4
#define VHD_BM_READ_PENDING          5

/* open-time flags, kept in vhd_state.flags */
#define VHD_FLAG_OPEN_RDONLY         1
#define VHD_FLAG_OPEN_NO_CACHE       2
#define VHD_FLAG_OPEN_QUIET          4
#define VHD_FLAG_OPEN_STRICT         8
#define VHD_FLAG_OPEN_QUERY          16
#define VHD_FLAG_OPEN_PREALLOCATE    32

/* NOTE(review): presumably bits of vhd_bat_state.status */
#define VHD_FLAG_BAT_LOCKED          1
#define VHD_FLAG_BAT_WRITE_STARTED   2

/* NOTE(review): presumably bits of vhd_bitmap.status */
#define VHD_FLAG_BM_UPDATE_BAT       1
#define VHD_FLAG_BM_WRITE_PENDING    2
#define VHD_FLAG_BM_READ_PENDING     4
#define VHD_FLAG_BM_LOCKED           8

/* NOTE(review): presumably bits of vhd_request.flags */
#define VHD_FLAG_REQ_UPDATE_BAT      1
#define VHD_FLAG_REQ_UPDATE_BITMAP   2
#define VHD_FLAG_REQ_QUEUED          4
#define VHD_FLAG_REQ_FINISHED        8

/* bits of vhd_transaction.status */
#define VHD_FLAG_TX_LIVE             1
#define VHD_FLAG_TX_UPDATE_BAT       2
168

    
169
/* storage type for every VHD_FLAG_* bit set used in this file */
typedef uint8_t vhd_flag_t;

/* forward declarations: the request and state structs refer to each other */
struct vhd_state;
struct vhd_request;
173

    
174
/* singly linked list of requests, chained through vhd_request.next */
struct vhd_req_list {
        struct vhd_request       *head;        /* first element, or NULL */
        struct vhd_request       *tail;        /* last element, or NULL */
};
178

    
179
struct vhd_transaction {
180
        int                       error;
181
        int                       closed;
182
        int                       started;
183
        int                       finished;
184
        vhd_flag_t                status;
185
        struct vhd_req_list       requests;
186
};
187

    
188
struct vhd_request {
189
        int                       error;
190
        uint8_t                   op;
191
        vhd_flag_t                flags;
192
        td_request_t              treq;
193
        struct tiocb              tiocb;
194
        struct vhd_state         *state;
195
        struct vhd_request       *next;
196
        struct vhd_transaction   *tx;
197
};
198

    
199
struct vhd_bat_state {
200
        vhd_bat_t                 bat;
201
        vhd_batmap_t              batmap;
202
        vhd_flag_t                status;
203
        uint32_t                  pbw_blk;     /* blk num of pending write */
204
        uint64_t                  pbw_offset;  /* file offset of same */
205
        struct vhd_request        req;         /* for writing bat table */
206
        struct vhd_request        zero_req;    /* for initializing bitmaps */
207
        char                     *bat_buf;
208
};
209

    
210
struct vhd_bitmap {
211
        uint32_t                  blk;
212
        uint64_t                  seqno;       /* lru sequence number */
213
        vhd_flag_t                status;
214

    
215
        char                     *map;         /* map should only be modified
216
                                                * in finish_bitmap_write */
217
        char                     *shadow;      /* in-memory bitmap changes are 
218
                                                * made to shadow and copied to
219
                                                * map only after having been
220
                                                * flushed to disk */
221
        struct vhd_transaction    tx;          /* transaction data structure
222
                                                * encapsulating data, bitmap, 
223
                                                * and bat writes */
224
        struct vhd_req_list       queue;       /* data writes waiting for next
225
                                                * transaction */
226
        struct vhd_req_list       waiting;     /* pending requests that cannot
227
                                                * be serviced until this bitmap
228
                                                * is read from disk */
229
        struct vhd_request        req;
230
};
231

    
232
struct vhd_state {
233
        vhd_flag_t                flags;
234

    
235
        /* VHD stuff */
236
        vhd_context_t             vhd;
237
        uint32_t                  spp;         /* sectors per page */
238
        uint32_t                  spb;         /* sectors per block */
239
        uint64_t                  first_db;    /* pointer to datablock 0 */
240
        uint64_t                  next_db;     /* pointer to the next 
241
                                                * (unallocated) datablock */
242

    
243
        struct vhd_bat_state      bat;
244

    
245
        uint64_t                  bm_lru;      /* lru sequence number */
246
        uint32_t                  bm_secs;     /* size of bitmap, in sectors */
247
        struct vhd_bitmap        *bitmap[VHD_CACHE_SIZE];
248

    
249
        int                       bm_free_count;
250
        struct vhd_bitmap        *bitmap_free[VHD_CACHE_SIZE];
251
        struct vhd_bitmap         bitmap_list[VHD_CACHE_SIZE];
252

    
253
        int                       vreq_free_count;
254
        struct vhd_request       *vreq_free[VHD_REQS_DATA];
255
        struct vhd_request        vreq_list[VHD_REQS_DATA];
256

    
257
        /* for redundant bitmap writes */
258
        int                       padbm_size;
259
        char                     *padbm_buf;
260
        long int                  debug_skipped_redundant_writes;
261
        long int                  debug_done_redundant_writes;
262

    
263
        td_driver_t              *driver;
264

    
265
        uint64_t                  queued;
266
        uint64_t                  completed;
267
        uint64_t                  returned;
268
        uint64_t                  reads;
269
        uint64_t                  read_size;
270
        uint64_t                  writes;
271
        uint64_t                  write_size;
272
};
273

    
274
/* bit helpers for vhd_flag_t words: test, set and clear the bits in 'flag' */
#define test_vhd_flag(word, flag)  ((word) & (flag))
#define set_vhd_flag(word, flag)   ((word) |= (flag))
#define clear_vhd_flag(word, flag) ((word) &= ~(flag))

/* in-memory BAT entry of block 'blk' in state 's' */
#define bat_entry(s, blk)          ((s)->bat.bat.bat[(blk)])
279

    
280
/* prototypes for routines defined elsewhere in this file */
static void vhd_complete(void *, struct tiocb *, int);
static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);

/*
 * Zero buffer shared by every open image: created by vhd_initialize()
 * and unmapped by vhd_free() on behalf of the state that created it.
 */
static struct vhd_state  *_vhd_master;   /* state that mapped the buffer */
static unsigned long      _vhd_zsize;    /* length of the mapping */
static char              *_vhd_zeros;    /* the mapping; NULL when absent */
286

    
287
static int
288
vhd_initialize(struct vhd_state *s)
289
{
290
        if (_vhd_zeros)
291
                return 0;
292

    
293
        _vhd_zsize = 2 * getpagesize();
294
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
295
                _vhd_zsize += VHD_BLOCK_SIZE;
296

    
297
        _vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
298
                          MAP_SHARED | MAP_ANONYMOUS, -1, 0);
299
        if (_vhd_zeros == MAP_FAILED) {
300
                EPRINTF("vhd_initialize failed: %d\n", -errno);
301
                _vhd_zeros = NULL;
302
                _vhd_zsize = 0;
303
                return -errno;
304
        }
305

    
306
        _vhd_master = s;
307
        return 0;
308
}
309

    
310
static void
311
vhd_free(struct vhd_state *s)
312
{
313
        if (_vhd_master != s || !_vhd_zeros)
314
                return;
315

    
316
        munmap(_vhd_zeros, _vhd_zsize);
317
        _vhd_zsize  = 0;
318
        _vhd_zeros  = NULL;
319
        _vhd_master = NULL;
320
}
321

    
322
static char *
323
_get_vhd_zeros(const char *func, unsigned long size)
324
{
325
        if (!_vhd_zeros || _vhd_zsize < size) {
326
                EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
327
                        func, size, _vhd_zsize, _vhd_zeros);
328
                ASSERT(0);
329
        }
330

    
331
        return _vhd_zeros;
332
}
333

    
334
/*
 * vhd_zeros: fetch the shared zero buffer for a request of 'size' bytes;
 * the calling function's name is passed along for the error message that
 * _get_vhd_zeros() prints when the buffer is missing or too short.
 */
#define vhd_zeros(size)        _get_vhd_zeros(__func__, size)
335

    
336
static inline void
337
set_batmap(struct vhd_state *s, uint32_t blk)
338
{
339
        if (s->bat.batmap.map) {
340
                vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
341
                DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
342
        }
343
}
344

    
345
static inline int
346
test_batmap(struct vhd_state *s, uint32_t blk)
347
{
348
        if (!s->bat.batmap.map)
349
                return 0;
350
        return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk);
351
}
352

    
353
static int
354
vhd_kill_footer(struct vhd_state *s)
355
{
356
        int err;
357
        off64_t end;
358
        void *zeros;
359

    
360
        if (s->vhd.footer.type == HD_TYPE_FIXED)
361
                return 0;
362

    
363
        err = posix_memalign(&zeros, 512, 512);
364
        if (err)
365
                return -err;
366

    
367
        err = 1;
368
        memset(zeros, 0xc7c7c7c7, 512);
369

    
370
        if ((end = lseek64(s->vhd.fd, 0, SEEK_END)) == -1)
371
                goto fail;
372

    
373
        if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == -1)
374
                goto fail;
375

    
376
        if (write(s->vhd.fd, zeros, 512) != 512)
377
                goto fail;
378

    
379
        err = 0;
380

    
381
 fail:
382
        free(zeros);
383
        if (err)
384
                return (errno ? -errno : -EIO);
385
        return 0;
386
}
387

    
388
static inline int
389
find_next_free_block(struct vhd_state *s)
390
{
391
        int err;
392
        off64_t eom;
393
        uint32_t i, entry;
394

    
395
        err = vhd_end_of_headers(&s->vhd, &eom);
396
        if (err)
397
                return err;
398

    
399
        s->next_db = secs_round_up(eom);
400
        s->first_db = s->next_db;
401
        if ((s->first_db + s->bm_secs) % s->spp)
402
                s->first_db += (s->spp - ((s->first_db + s->bm_secs) % s->spp));
403

    
404
        for (i = 0; i < s->bat.bat.entries; i++) {
405
                entry = bat_entry(s, i);
406
                if (entry != DD_BLK_UNUSED && entry >= s->next_db)
407
                        s->next_db = entry + s->spb + s->bm_secs;
408
        }
409

    
410
        return 0;
411
}
412

    
413
static void
414
vhd_free_bat(struct vhd_state *s)
415
{
416
        free(s->bat.bat.bat);
417
        free(s->bat.batmap.map);
418
        free(s->bat.bat_buf);
419
        memset(&s->bat, 0, sizeof(struct vhd_bat));
420
}
421

    
422
static int
423
vhd_initialize_bat(struct vhd_state *s)
424
{
425
        int err, batmap_required, i;
426
        void *buf;
427

    
428
        memset(&s->bat, 0, sizeof(struct vhd_bat));
429

    
430
        err = vhd_read_bat(&s->vhd, &s->bat.bat);
431
        if (err) {
432
                EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
433
                return err;
434
        }
435

    
436
        batmap_required = 1;
437
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
438
                batmap_required = 0;
439
        } else {
440
                err = find_next_free_block(s);
441
                if (err)
442
                        goto fail;
443
        }
444

    
445
        if (vhd_has_batmap(&s->vhd)) {
446
                for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
447
                        err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
448
                        if (err) {
449
                                EPRINTF("%s: reading batmap: %d\n",
450
                                                s->vhd.file, err);
451
                                if (batmap_required)
452
                                        goto fail;
453
                        } else {
454
                                break;
455
                        }
456
                }
457
                if (err)
458
                        EPRINTF("%s: ignoring non-critical batmap error\n",
459
                                        s->vhd.file);
460
        }
461

    
462
        err = posix_memalign(&buf, VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
463
        if (err)
464
                goto fail;
465

    
466
        s->bat.bat_buf = buf;
467

    
468
        return 0;
469

    
470
fail:
471
        vhd_free_bat(s);
472
        return err;
473
}
474

    
475
static void
476
vhd_free_bitmap_cache(struct vhd_state *s)
477
{
478
        int i;
479
        struct vhd_bitmap *bm;
480

    
481
        for (i = 0; i < VHD_CACHE_SIZE; i++) {
482
                bm = s->bitmap_list + i;
483
                free(bm->map);
484
                free(bm->shadow);
485
                s->bitmap_free[i] = NULL;
486
        }
487

    
488
        memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
489
}
490

    
491
static int
492
vhd_initialize_bitmap_cache(struct vhd_state *s)
493
{
494
        int i, err, map_size;
495
        struct vhd_bitmap *bm;
496
        void *map, *shadow;
497

    
498
        memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
499

    
500
        s->bm_lru        = 0;
501
        map_size         = vhd_sectors_to_bytes(s->bm_secs);
502
        s->bm_free_count = VHD_CACHE_SIZE;
503

    
504
        for (i = 0; i < VHD_CACHE_SIZE; i++) {
505
                bm = s->bitmap_list + i;
506

    
507
                err = posix_memalign(&map, 512, map_size);
508
                if (err)
509
                        goto fail;
510

    
511
                bm->map = map;
512

    
513
                err = posix_memalign(&shadow, 512, map_size);
514
                if (err)
515
                        goto fail;
516

    
517
                bm->shadow = shadow;
518

    
519
                memset(bm->map, 0, map_size);
520
                memset(bm->shadow, 0, map_size);
521
                s->bitmap_free[i] = bm;
522
        }
523

    
524
        return 0;
525

    
526
fail:
527
        vhd_free_bitmap_cache(s);
528
        return err;
529
}
530

    
531
static int
532
vhd_initialize_dynamic_disk(struct vhd_state *s)
533
{
534
        uint32_t bm_size;
535
        void *buf;
536
        int err;
537

    
538
        err = vhd_get_header(&s->vhd);
539
        if (err) {
540
                if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
541
                        EPRINTF("Error reading VHD DD header.\n");
542
                return err;
543
        }
544

    
545
        if (s->vhd.header.hdr_ver != 0x00010000) {
546
                EPRINTF("unsupported header version! (0x%x)\n",
547
                        s->vhd.header.hdr_ver);
548
                return -EINVAL;
549
        }
550

    
551
        s->spp     = getpagesize() >> VHD_SECTOR_SHIFT;
552
        s->spb     = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
553
        s->bm_secs = secs_round_up_no_zero(s->spb >> 3);
554

    
555
        s->padbm_size = (s->bm_secs / getpagesize()) * getpagesize();
556
        if (s->bm_secs % getpagesize())
557
                s->padbm_size += getpagesize();
558

    
559
        err = posix_memalign(&buf, 512, s->padbm_size);
560
        if (err)
561
                return -err;
562

    
563
        s->padbm_buf = buf;
564
        bm_size = s->bm_secs << VHD_SECTOR_SHIFT;
565
        memset(s->padbm_buf, 0, s->padbm_size - bm_size);
566
        memset(s->padbm_buf + (s->padbm_size - bm_size), ~0, bm_size);
567
        s->debug_skipped_redundant_writes = 0;
568
        s->debug_done_redundant_writes = 0;
569

    
570
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
571
                return 0;
572

    
573
        err = vhd_initialize_bat(s);
574
        if (err)
575
                return err;
576

    
577
        err = vhd_initialize_bitmap_cache(s);
578
        if (err) {
579
                vhd_free_bat(s);
580
                return err;
581
        }
582

    
583
        return 0;
584
}
585

    
586
static int
587
vhd_check_version(struct vhd_state *s)
588
{
589
        if (strncmp(s->vhd.footer.crtr_app, "tap", 3))
590
                return 0;
591

    
592
        if (s->vhd.footer.crtr_ver > VHD_CURRENT_VERSION) {
593
                if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
594
                        EPRINTF("WARNING: %s vhd creator version 0x%08x, "
595
                                "but only versions up to 0x%08x are "
596
                                "supported for IO\n", s->vhd.file,
597
                                s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);
598

    
599
                return -EINVAL;
600
        }
601

    
602
        return 0;
603
}
604

    
605
static void
606
vhd_log_open(struct vhd_state *s)
607
{
608
        char buf[5];
609
        uint32_t i, allocated, full;
610

    
611
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
612
                return;
613

    
614
        snprintf(buf, sizeof(buf), "%s", s->vhd.footer.crtr_app);
615
        if (!vhd_type_dynamic(&s->vhd)) {
616
                DPRINTF("%s version: %s 0x%08x\n",
617
                        s->vhd.file, buf, s->vhd.footer.crtr_ver);
618
                return;
619
        }
620

    
621
        allocated = 0;
622
        full      = 0;
623

    
624
        for (i = 0; i < s->bat.bat.entries; i++) {
625
                if (bat_entry(s, i) != DD_BLK_UNUSED)
626
                        allocated++;
627
                if (test_batmap(s, i))
628
                        full++;
629
        }
630

    
631
        DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
632
                s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
633
                allocated, full, s->next_db);
634
}
635

    
636
static int
637
__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
638
{
639
        int i, o_flags, err;
640
        struct vhd_state *s;
641

    
642
        DBG(TLOG_INFO, "vhd_open: %s\n", name);
643
        if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
644
                libvhd_set_log_level(1);
645

    
646
        s = (struct vhd_state *)driver->data;
647
        memset(s, 0, sizeof(struct vhd_state));
648

    
649
        s->flags  = flags;
650
        s->driver = driver;
651

    
652
        err = vhd_initialize(s);
653
        if (err)
654
                return err;
655

    
656
        o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ? 
657
                   VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
658

    
659
        if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
660
                set_vhd_flag(o_flags, VHD_OPEN_STRICT);
661

    
662
        err = vhd_open(&s->vhd, name, o_flags);
663
        if (err) {
664
                libvhd_set_log_level(1);
665
                err = vhd_open(&s->vhd, name, o_flags);
666
                if (err) {
667
                        EPRINTF("Unable to open [%s] (%d)!\n", name, err);
668
                        return err;
669
                }
670
        }
671

    
672
        err = vhd_check_version(s);
673
        if (err)
674
                goto fail;
675

    
676
        s->spb = s->spp = 1;
677

    
678
        if (vhd_type_dynamic(&s->vhd)) {
679
                err = vhd_initialize_dynamic_disk(s);
680
                if (err)
681
                        goto fail;
682
        }
683

    
684
        vhd_log_open(s);
685

    
686
        SPB = s->spb;
687

    
688
        s->vreq_free_count = VHD_REQS_DATA;
689
        for (i = 0; i < VHD_REQS_DATA; i++)
690
                s->vreq_free[i] = s->vreq_list + i;
691

    
692
        driver->info.size        = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
693
        driver->info.sector_size = VHD_SECTOR_SIZE;
694
        driver->info.info        = 0;
695

    
696
        DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n",
697
            driver->info.size, driver->info.sector_size, driver->info.info);
698

    
699
        if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) && 
700
            !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
701
                err = vhd_kill_footer(s);
702
                if (err) {
703
                        DPRINTF("ERROR killing footer: %d\n", err);
704
                        goto fail;
705
                }
706
                s->writes++;
707
        }
708

    
709
        return 0;
710

    
711
 fail:
712
        vhd_free_bat(s);
713
        vhd_free_bitmap_cache(s);
714
        vhd_close(&s->vhd);
715
        vhd_free(s);
716
        return err;
717
}
718

    
719
static int
720
_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
721
{
722
        vhd_flag_t vhd_flags = 0;
723

    
724
        if (flags & TD_OPEN_RDONLY)
725
                vhd_flags |= VHD_FLAG_OPEN_RDONLY;
726
        if (flags & TD_OPEN_QUIET)
727
                vhd_flags |= VHD_FLAG_OPEN_QUIET;
728
        if (flags & TD_OPEN_STRICT)
729
                vhd_flags |= VHD_FLAG_OPEN_STRICT;
730
        if (flags & TD_OPEN_QUERY)
731
                vhd_flags |= (VHD_FLAG_OPEN_QUERY  |
732
                              VHD_FLAG_OPEN_QUIET  |
733
                              VHD_FLAG_OPEN_RDONLY |
734
                              VHD_FLAG_OPEN_NO_CACHE);
735

    
736
        /* pre-allocate for all but NFS and LVM storage */
737
        driver->storage = tapdisk_storage_type(name);
738

    
739
        if (driver->storage != TAPDISK_STORAGE_TYPE_NFS &&
740
            driver->storage != TAPDISK_STORAGE_TYPE_LVM)
741
                vhd_flags |= VHD_FLAG_OPEN_PREALLOCATE;
742

    
743
        return __vhd_open(driver, name, vhd_flags);
744
}
745

    
746
static void
747
vhd_log_close(struct vhd_state *s)
748
{
749
        uint32_t i, allocated, full;
750

    
751
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
752
                return;
753

    
754
        allocated = 0;
755
        full      = 0;
756

    
757
        for (i = 0; i < s->bat.bat.entries; i++) {
758
                if (bat_entry(s, i) != DD_BLK_UNUSED)
759
                        allocated++;
760
                if (test_batmap(s, i))
761
                        full++;
762
        }
763

    
764
        DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
765
                s->vhd.file, s->bat.bat.entries, allocated, full, s->next_db);
766
}
767

    
768
static int
769
_vhd_close(td_driver_t *driver)
770
{
771
        int err;
772
        struct vhd_state *s;
773
        
774
        DBG(TLOG_WARN, "vhd_close\n");
775
        s = (struct vhd_state *)driver->data;
776

    
777
        DPRINTF("gaps written/skipped: %ld/%ld\n", 
778
                        s->debug_done_redundant_writes,
779
                        s->debug_skipped_redundant_writes);
780

    
781
        /* don't write footer if tapdisk is read-only */
782
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY))
783
                goto free;
784
        
785
        /* 
786
         * write footer if:
787
         *   - we killed it on open (opened with strict) 
788
         *   - we've written data since opening
789
         */
790
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) {
791
                memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
792
                err = vhd_write_footer(&s->vhd, &s->vhd.footer);
793
                memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));
794

    
795
                if (err)
796
                        EPRINTF("writing %s footer: %d\n", s->vhd.file, err);
797

    
798
                if (!vhd_has_batmap(&s->vhd))
799
                        goto free;
800

    
801
                err = vhd_write_batmap(&s->vhd, &s->bat.batmap);
802
                if (err)
803
                        EPRINTF("writing %s batmap: %d\n", s->vhd.file, err);
804
        }
805

    
806
 free:
807
        vhd_log_close(s);
808
        vhd_free_bat(s);
809
        vhd_free_bitmap_cache(s);
810
        vhd_close(&s->vhd);
811
        vhd_free(s);
812

    
813
        memset(s, 0, sizeof(struct vhd_state));
814

    
815
        return 0;
816
}
817

    
818
int
819
vhd_validate_parent(td_driver_t *child_driver,
820
                    td_driver_t *parent_driver, td_flag_t flags)
821
{
822
        struct vhd_state *child  = (struct vhd_state *)child_driver->data;
823
        struct vhd_state *parent;
824

    
825
        if (parent_driver->type != DISK_TYPE_VHD) {
826
                if (child_driver->type != DISK_TYPE_VHD)
827
                        return -EINVAL;
828
                if (child->vhd.footer.type != HD_TYPE_DIFF)
829
                        return -EINVAL;
830
                if (!vhd_parent_raw(&child->vhd))
831
                        return -EINVAL;
832
                return 0;
833
        }
834

    
835
        parent = (struct vhd_state *)parent_driver->data;
836

    
837
        /* 
838
         * This check removed because of cases like:
839
         *   - parent VHD marked as 'hidden'
840
         *   - parent VHD modified during coalesce
841
         */
842
        /*
843
        if (stat(parent->vhd.file, &stats)) {
844
                DPRINTF("ERROR stating parent file %s\n", parent->vhd.file);
845
                return -errno;
846
        }
847

848
        if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) {
849
                DPRINTF("ERROR: parent file has been modified since "
850
                        "snapshot.  Child image no longer valid.\n");
851
                return -EINVAL;
852
        }
853
        */
854

    
855
        if (uuid_compare(child->vhd.header.prt_uuid, parent->vhd.footer.uuid)) {
856
                DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
857
                        "snapshot.  Child image no longer valid.\n",
858
                        __func__, child->vhd.file, parent->vhd.file);
859
                return -EINVAL;
860
        }
861

    
862
        /* TODO: compare sizes */
863
        
864
        return 0;
865
}
866

    
867
int
868
vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
869
{
870
        int err;
871
        char *parent;
872
        struct vhd_state *s;
873

    
874
        DBG(TLOG_DBG, "\n");
875
        memset(id, 0, sizeof(td_disk_id_t));
876

    
877
        s = (struct vhd_state *)driver->data;
878

    
879
        if (s->vhd.footer.type != HD_TYPE_DIFF)
880
                return TD_NO_PARENT;
881

    
882
        err = vhd_parent_locator_get(&s->vhd, &parent);
883
        if (err)
884
                return err;
885

    
886
        id->name   = parent;
887
        id->type   = vhd_parent_raw(&s->vhd) ? DISK_TYPE_AIO : DISK_TYPE_VHD;
888
        id->flags |= TD_OPEN_SHAREABLE|TD_OPEN_RDONLY;
889

    
890
        return 0;
891
}
892

    
893
static inline void
894
clear_req_list(struct vhd_req_list *list)
895
{
896
        list->head = list->tail = NULL;
897
}
898

    
899
static inline void
900
add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
901
{
902
        if (!list->head) 
903
                list->head = list->tail = e;
904
        else 
905
                list->tail = list->tail->next = e;
906
}
907

    
908
static inline int
909
remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
910
{
911
        struct vhd_request *i = list->head;
912

    
913
        if (list->head == e) {
914
                if (list->tail == e)
915
                        clear_req_list(list);
916
                else
917
                        list->head = list->head->next;
918
                return 0;
919
        }
920

    
921
        while (i->next) {
922
                if (i->next == e) {
923
                        if (list->tail == e) {
924
                                i->next = NULL;
925
                                list->tail = i;
926
                        } else
927
                                i->next = i->next->next;
928
                        return 0;
929
                }
930
                i = i->next;
931
        }
932

    
933
        return -EINVAL;
934
}
935

    
936
static inline void
937
init_vhd_request(struct vhd_state *s, struct vhd_request *req)
938
{
939
        memset(req, 0, sizeof(struct vhd_request));
940
        req->state = s;
941
}
942

    
943
static inline void
944
init_tx(struct vhd_transaction *tx)
945
{
946
        memset(tx, 0, sizeof(struct vhd_transaction));
947
}
948

    
949
static inline void
950
add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
951
{
952
        ASSERT(!tx->closed);
953

    
954
        r->tx = tx;
955
        tx->started++;
956
        add_to_tail(&tx->requests, r);
957
        set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);
958

    
959
        DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
960
            "started: %d, finished: %d, status: %u\n",
961
            r->treq.sec / SPB, r->treq.sec, tx,
962
            tx->started, tx->finished, tx->status);
963
}
964

    
965
static inline int
966
transaction_completed(struct vhd_transaction *tx)
967
{
968
        return (tx->started == tx->finished);
969
}
970

    
971
static inline void
972
init_bat(struct vhd_state *s)
973
{
974
        s->bat.req.tx     = NULL;
975
        s->bat.req.next   = NULL;
976
        s->bat.req.error  = 0;
977
        s->bat.pbw_blk    = 0;
978
        s->bat.pbw_offset = 0;
979
        s->bat.status     = 0;
980
}
981

    
982
static inline void
983
lock_bat(struct vhd_state *s)
984
{
985
        set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
986
}
987

    
988
static inline void
989
unlock_bat(struct vhd_state *s)
990
{
991
        clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
992
}
993

    
994
static inline int
995
bat_locked(struct vhd_state *s)
996
{
997
        return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
998
}
999

    
1000
static inline void
1001
init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
1002
{
1003
        bm->blk    = 0;
1004
        bm->seqno  = 0;
1005
        bm->status = 0;
1006
        init_tx(&bm->tx);
1007
        clear_req_list(&bm->queue);
1008
        clear_req_list(&bm->waiting);
1009
        memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs));
1010
        memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs));
1011
        init_vhd_request(s, &bm->req);
1012
}
1013

    
1014
static inline struct vhd_bitmap *
1015
get_bitmap(struct vhd_state *s, uint32_t block)
1016
{
1017
        int i;
1018
        struct vhd_bitmap *bm;
1019

    
1020
        for (i = 0; i < VHD_CACHE_SIZE; i++) {
1021
                bm = s->bitmap[i];
1022
                if (bm && bm->blk == block)
1023
                        return bm;
1024
        }
1025

    
1026
        return NULL;
1027
}
1028

    
1029
static inline void
1030
lock_bitmap(struct vhd_bitmap *bm)
1031
{
1032
        set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
1033
}
1034

    
1035
static inline void
1036
unlock_bitmap(struct vhd_bitmap *bm)
1037
{
1038
        clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
1039
}
1040

    
1041
static inline int
1042
bitmap_locked(struct vhd_bitmap *bm)
1043
{
1044
        return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
1045
}
1046

    
1047
static inline int
1048
bitmap_valid(struct vhd_bitmap *bm)
1049
{
1050
        return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
1051
}
1052

    
1053
static inline int
1054
bitmap_in_use(struct vhd_bitmap *bm)
1055
{
1056
        return (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)  ||
1057
                test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING) ||
1058
                test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT) ||
1059
                bm->waiting.head || bm->tx.requests.head || bm->queue.head);
1060
}
1061

    
1062
static inline int
1063
bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
1064
{
1065
        int i, n;
1066

    
1067
        n = s->spb >> 3;
1068
        for (i = 0; i < n; i++)
1069
                if (bm->map[i] != (char)0xFF)
1070
                        return 0;
1071

    
1072
        DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
1073
        return 1;
1074
}
1075

    
1076
static struct vhd_bitmap *
1077
remove_lru_bitmap(struct vhd_state *s)
1078
{
1079
        int i, idx = 0;
1080
        uint64_t seq = s->bm_lru;
1081
        struct vhd_bitmap *bm, *lru = NULL;
1082

    
1083
        for (i = 0; i < VHD_CACHE_SIZE; i++) {
1084
                bm = s->bitmap[i];
1085
                if (bm && bm->seqno < seq && !bitmap_locked(bm)) {
1086
                        idx = i;
1087
                        lru = bm;
1088
                        seq = lru->seqno;
1089
                }
1090
        }
1091

    
1092
        if (lru) {
1093
                s->bitmap[idx] = NULL;
1094
                ASSERT(!bitmap_in_use(lru));
1095
        }
1096

    
1097
        return  lru;
1098
}
1099

    
1100
static int
1101
alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
1102
{
1103
        struct vhd_bitmap *bm;
1104
        
1105
        *bitmap = NULL;
1106

    
1107
        if (s->bm_free_count > 0) {
1108
                bm = s->bitmap_free[--s->bm_free_count];
1109
        } else {
1110
                bm = remove_lru_bitmap(s);
1111
                if (!bm)
1112
                        return -EBUSY;
1113
        }
1114

    
1115
        init_vhd_bitmap(s, bm);
1116
        bm->blk = blk;
1117
        *bitmap = bm;
1118

    
1119
        return 0;
1120
}
1121

    
1122
static inline uint64_t
1123
__bitmap_lru_seqno(struct vhd_state *s)
1124
{
1125
        int i;
1126
        struct vhd_bitmap *bm;
1127

    
1128
        if (s->bm_lru == 0xffffffff) {
1129
                s->bm_lru = 0;
1130
                for (i = 0; i < VHD_CACHE_SIZE; i++) {
1131
                        bm = s->bitmap[i];
1132
                        if (bm) {
1133
                                bm->seqno >>= 1;
1134
                                if (bm->seqno > s->bm_lru)
1135
                                        s->bm_lru = bm->seqno;
1136
                        }
1137
                }
1138
        }
1139

    
1140
        return ++s->bm_lru;
1141
}
1142

    
1143
static inline void
1144
touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
1145
{
1146
        bm->seqno = __bitmap_lru_seqno(s);
1147
}
1148

    
1149
static inline void
1150
install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
1151
{
1152
        int i;
1153
        for (i = 0; i < VHD_CACHE_SIZE; i++) {
1154
                if (!s->bitmap[i]) {
1155
                        touch_bitmap(s, bm);
1156
                        s->bitmap[i] = bm;
1157
                        return;
1158
                }
1159
        }
1160

    
1161
        ASSERT(0);
1162
}
1163

    
1164
static inline void
1165
free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
1166
{
1167
        int i;
1168

    
1169
        for (i = 0; i < VHD_CACHE_SIZE; i++)
1170
                if (s->bitmap[i] == bm)
1171
                        break;
1172

    
1173
        ASSERT(!bitmap_locked(bm));
1174
        ASSERT(!bitmap_in_use(bm));
1175
        ASSERT(i < VHD_CACHE_SIZE);
1176

    
1177
        s->bitmap[i] = NULL;
1178
        s->bitmap_free[s->bm_free_count++] = bm;
1179
}
1180

    
1181
static int
1182
read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
1183
{
1184
        uint32_t blk, sec;
1185
        struct vhd_bitmap *bm;
1186

    
1187
        /* in fixed disks, every block is present */
1188
        if (s->vhd.footer.type == HD_TYPE_FIXED) 
1189
                return VHD_BM_BIT_SET;
1190

    
1191
        blk = sector / s->spb;
1192
        sec = sector % s->spb;
1193

    
1194
        if (blk > s->vhd.header.max_bat_size) {
1195
                DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
1196
                        sector, op);
1197
                return -EINVAL;
1198
        }
1199

    
1200
        if (bat_entry(s, blk) == DD_BLK_UNUSED) {
1201
                if (op == VHD_OP_DATA_WRITE &&
1202
                    s->bat.pbw_blk != blk && bat_locked(s))
1203
                        return VHD_BM_BAT_LOCKED;
1204

    
1205
                return VHD_BM_BAT_CLEAR;
1206
        }
1207

    
1208
        if (test_batmap(s, blk)) {
1209
                DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
1210
                return VHD_BM_BIT_SET;
1211
        }
1212

    
1213
        bm = get_bitmap(s, blk);
1214
        if (!bm)
1215
                return VHD_BM_NOT_CACHED;
1216

    
1217
        /* bump lru count */
1218
        touch_bitmap(s, bm);
1219

    
1220
        if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
1221
                return VHD_BM_READ_PENDING;
1222

    
1223
        return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ? 
1224
                VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
1225
}
1226

    
1227
static int
1228
read_bitmap_cache_span(struct vhd_state *s, 
1229
                       uint64_t sector, int nr_secs, int value)
1230
{
1231
        int ret;
1232
        uint32_t blk, sec;
1233
        struct vhd_bitmap *bm;
1234

    
1235
        /* in fixed disks, every block is present */
1236
        if (s->vhd.footer.type == HD_TYPE_FIXED) 
1237
                return nr_secs;
1238

    
1239
        sec = sector % s->spb;
1240
        blk = sector / s->spb;
1241

    
1242
        if (test_batmap(s, blk))
1243
                return MIN(nr_secs, s->spb - sec);
1244

    
1245
        bm  = get_bitmap(s, blk);
1246
        
1247
        ASSERT(bm && bitmap_valid(bm));
1248

    
1249
        for (ret = 0; sec < s->spb && ret < nr_secs; sec++, ret++)
1250
                if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
1251
                        break;
1252

    
1253
        return ret;
1254
}
1255

    
1256
static inline struct vhd_request *
1257
alloc_vhd_request(struct vhd_state *s)
1258
{
1259
        struct vhd_request *req = NULL;
1260
        
1261
        if (s->vreq_free_count > 0) {
1262
                req = s->vreq_free[--s->vreq_free_count];
1263
                ASSERT(req->treq.secs == 0);
1264
                init_vhd_request(s, req);
1265
                return req;
1266
        }
1267

    
1268
        return NULL;
1269
}
1270

    
1271
static inline void
1272
free_vhd_request(struct vhd_state *s, struct vhd_request *req)
1273
{
1274
        memset(req, 0, sizeof(struct vhd_request));
1275
        s->vreq_free[s->vreq_free_count++] = req;
1276
}
1277

    
1278
static inline void
1279
aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
1280
{
1281
        struct tiocb *tiocb = &req->tiocb;
1282

    
1283
        td_prep_read(tiocb, s->vhd.fd, req->treq.buf,
1284
                     vhd_sectors_to_bytes(req->treq.secs),
1285
                     offset, vhd_complete, req);
1286
        td_queue_tiocb(s->driver, tiocb);
1287

    
1288
        s->queued++;
1289
        s->reads++;
1290
        s->read_size += req->treq.secs;
1291
        TRACE(s);
1292
}
1293

    
1294
static inline void
1295
aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
1296
{
1297
        struct tiocb *tiocb = &req->tiocb;
1298

    
1299
        td_prep_write(tiocb, s->vhd.fd, req->treq.buf,
1300
                      vhd_sectors_to_bytes(req->treq.secs),
1301
                      offset, vhd_complete, req);
1302
        td_queue_tiocb(s->driver, tiocb);
1303

    
1304
        s->queued++;
1305
        s->writes++;
1306
        s->write_size += req->treq.secs;
1307
        TRACE(s);
1308
}
1309

    
1310
static inline uint64_t
1311
reserve_new_block(struct vhd_state *s, uint32_t blk)
1312
{
1313
        int gap = 0;
1314

    
1315
        ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
1316

    
1317
        /* data region of segment should begin on page boundary */
1318
        if ((s->next_db + s->bm_secs) % s->spp)
1319
                gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
1320

    
1321
        s->bat.pbw_blk    = blk;
1322
        s->bat.pbw_offset = s->next_db + gap;
1323

    
1324
        return s->next_db;
1325
}
1326

    
1327
static int
1328
schedule_bat_write(struct vhd_state *s)
1329
{
1330
        int i;
1331
        uint32_t blk;
1332
        char *buf;
1333
        uint64_t offset;
1334
        struct vhd_request *req;
1335

    
1336
        ASSERT(bat_locked(s));
1337

    
1338
        req = &s->bat.req;
1339
        buf = s->bat.bat_buf;
1340
        blk = s->bat.pbw_blk;
1341

    
1342
        init_vhd_request(s, req);
1343
        memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);
1344

    
1345
        ((uint32_t *)buf)[blk % 128] = s->bat.pbw_offset;
1346

    
1347
        for (i = 0; i < 128; i++)
1348
                BE32_OUT(&((uint32_t *)buf)[i]);
1349

    
1350
        offset         = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
1351
        req->treq.secs = 1;
1352
        req->treq.buf  = buf;
1353
        req->op        = VHD_OP_BAT_WRITE;
1354
        req->next      = NULL;
1355

    
1356
        aio_write(s, req, offset);
1357
        set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);
1358

    
1359
        DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
1360
            "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);
1361

    
1362
        return 0;
1363
}
1364

    
1365
static void
1366
schedule_zero_bm_write(struct vhd_state *s,
1367
                       struct vhd_bitmap *bm, uint64_t lb_end)
1368
{
1369
        uint64_t offset;
1370
        struct vhd_request *req = &s->bat.zero_req;
1371

    
1372
        init_vhd_request(s, req);
1373

    
1374
        offset         = vhd_sectors_to_bytes(lb_end);
1375
        req->op        = VHD_OP_ZERO_BM_WRITE;
1376
        req->treq.sec  = s->bat.pbw_blk * s->spb;
1377
        req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
1378
        req->treq.buf  = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
1379
        req->next      = NULL;
1380

    
1381
        DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
1382
            s->bat.pbw_blk, offset);
1383

    
1384
        lock_bitmap(bm);
1385
        add_to_transaction(&bm->tx, req);
1386
        aio_write(s, req, offset);
1387
}
1388

    
1389
/* This is a performance optimization. When writing sequentially into full 
1390
 * blocks, skipping (up-to-date) bitmaps causes an approx. 25% reduction in 
1391
 * throughput. To prevent skipping, we issue redundant writes into the (padded) 
1392
 * bitmap area just to make all writes sequential. This will help VHDs on raw 
1393
 * block devices, while the FS-based VHDs shouldn't suffer much.
1394
 *
1395
 * Note that it only makes sense to perform this redundant bitmap write if the 
1396
 * block is completely full (i.e. the batmap entry is set). If the block is not 
1397
 * completely full then one of the following two things will be true:
1398
 *  1. we'll either be allocating new sectors in this block and writing its
1399
 *     bitmap transactionally, which will be slow anyways; or
1400
 *  2. the IO will be skipping over the unallocated sectors again, so the
1401
 *     pattern will not be sequential anyways
1402
 * In either case a redundant bitmap write becomes pointless. This fact 
1403
 * simplifies the implementation of redundant writes: since we know the bitmap 
1404
 * cannot be updated by anyone else, we don't have to worry about transactions 
1405
 * or potential write conflicts.
1406
 * */
1407
static void
1408
schedule_redundant_bm_write(struct vhd_state *s, uint32_t blk)
1409
{
1410
        uint64_t offset;
1411
        struct vhd_request *req;
1412

    
1413
        ASSERT(s->vhd.footer.type != HD_TYPE_FIXED);
1414
        ASSERT(test_batmap(s, blk));
1415

    
1416
        req = alloc_vhd_request(s);
1417
        if (!req) 
1418
                return;
1419

    
1420
        req->treq.buf = s->padbm_buf;
1421

    
1422
        offset = bat_entry(s, blk);
1423
        ASSERT(offset != DD_BLK_UNUSED);
1424
        offset <<= VHD_SECTOR_SHIFT;
1425
        offset -= s->padbm_size - (s->bm_secs << VHD_SECTOR_SHIFT);
1426

    
1427
        req->op        = VHD_OP_REDUNDANT_BM_WRITE;
1428
        req->treq.sec  = blk * s->spb;
1429
        req->treq.secs = s->padbm_size >> VHD_SECTOR_SHIFT;
1430
        req->next      = NULL;
1431

    
1432
        DBG(TLOG_DBG, "blk: %u, writing redundant bitmap at %" PRIu64 "\n",
1433
            blk, offset);
1434

    
1435
        aio_write(s, req, offset);
1436
}
1437

    
1438
static int
1439
update_bat(struct vhd_state *s, uint32_t blk)
1440
{
1441
        int err;
1442
        uint64_t lb_end;
1443
        struct vhd_bitmap *bm;
1444

    
1445
        ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
1446
        
1447
        if (bat_locked(s)) {
1448
                ASSERT(s->bat.pbw_blk == blk);
1449
                return 0;
1450
        }
1451

    
1452
        /* empty bitmap could already be in
1453
         * cache if earlier bat update failed */
1454
        bm = get_bitmap(s, blk);
1455
        if (!bm) {
1456
                /* install empty bitmap in cache */
1457
                err = alloc_vhd_bitmap(s, &bm, blk);
1458
                if (err) 
1459
                        return err;
1460

    
1461
                install_bitmap(s, bm);
1462
        }
1463

    
1464
        lock_bat(s);
1465
        lb_end = reserve_new_block(s, blk);
1466
        schedule_zero_bm_write(s, bm, lb_end);
1467
        set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);
1468

    
1469
        return 0;
1470
}
1471

    
1472
static int
1473
allocate_block(struct vhd_state *s, uint32_t blk)
1474
{
1475
        int err, gap;
1476
        uint64_t offset, size;
1477
        struct vhd_bitmap *bm;
1478
        ssize_t count;
1479

    
1480
        ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
1481

    
1482
        if (bat_locked(s)) {
1483
                ASSERT(s->bat.pbw_blk == blk);
1484
                if (s->bat.req.error)
1485
                        return -EBUSY;
1486
                return 0;
1487
        }
1488

    
1489
        gap            = 0;
1490
        s->bat.pbw_blk = blk;
1491
        offset         = vhd_sectors_to_bytes(s->next_db);
1492

    
1493
        /* data region of segment should begin on page boundary */
1494
        if ((s->next_db + s->bm_secs) % s->spp) {
1495
                gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
1496
                s->next_db += gap;
1497
        }
1498

    
1499
        s->bat.pbw_offset = s->next_db;
1500

    
1501
        DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
1502
            blk, s->bat.pbw_offset);
1503

    
1504
        if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
1505
                ERR(s, -errno, "lseek failed\n");
1506
                return -errno;
1507
        }
1508

    
1509
        size  = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
1510
        count = write(s->vhd.fd, vhd_zeros(size), size);
1511
        if (count != size) {
1512
                err = count < 0 ? -errno : -ENOSPC;
1513
                ERR(s, -errno,
1514
                    "write failed (%zd, offset %"PRIu64")\n", count, offset);
1515
                return err;
1516
        }
1517

    
1518
        /* empty bitmap could already be in
1519
         * cache if earlier bat update failed */
1520
        bm = get_bitmap(s, blk);
1521
        if (!bm) {
1522
                /* install empty bitmap in cache */
1523
                err = alloc_vhd_bitmap(s, &bm, blk);
1524
                if (err) 
1525
                        return err;
1526

    
1527
                install_bitmap(s, bm);
1528
        }
1529

    
1530
        lock_bat(s);
1531
        lock_bitmap(bm);
1532
        schedule_bat_write(s);
1533
        add_to_transaction(&bm->tx, &s->bat.req);
1534

    
1535
        return 0;
1536
}
1537

    
1538
static int 
1539
schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
1540
{
1541
        uint64_t offset;
1542
        uint32_t blk = 0, sec = 0;
1543
        struct vhd_bitmap  *bm;
1544
        struct vhd_request *req;
1545

    
1546
        if (s->vhd.footer.type == HD_TYPE_FIXED) {
1547
                offset = vhd_sectors_to_bytes(treq.sec);
1548
                goto make_request;
1549
        }
1550

    
1551
        blk    = treq.sec / s->spb;
1552
        sec    = treq.sec % s->spb;
1553
        bm     = get_bitmap(s, blk);
1554
        offset = bat_entry(s, blk);
1555

    
1556
        ASSERT(offset != DD_BLK_UNUSED);
1557
        ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));
1558

    
1559
        offset += s->bm_secs + sec;
1560
        offset  = vhd_sectors_to_bytes(offset);
1561

    
1562
 make_request:
1563
        req = alloc_vhd_request(s);
1564
        if (!req) 
1565
                return -EBUSY;
1566

    
1567
        req->treq  = treq;
1568
        req->flags = flags;
1569
        req->op    = VHD_OP_DATA_READ;
1570
        req->next  = NULL;
1571

    
1572
        aio_read(s, req, offset);
1573

    
1574
        DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
1575
            "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
1576
            s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
1577
            treq.buf);
1578

    
1579
        return 0;
1580
}
1581

    
1582
static int
1583
schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
1584
{
1585
        int err;
1586
        uint64_t offset;
1587
        uint32_t blk = 0, sec = 0;
1588
        struct vhd_bitmap  *bm = NULL;
1589
        struct vhd_request *req;
1590

    
1591
        if (s->vhd.footer.type == HD_TYPE_FIXED) {
1592
                offset = vhd_sectors_to_bytes(treq.sec);
1593
                goto make_request;
1594
        }
1595

    
1596
        blk    = treq.sec / s->spb;
1597
        sec    = treq.sec % s->spb;
1598
        offset = bat_entry(s, blk);
1599

    
1600
        if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
1601
                if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
1602
                        err = allocate_block(s, blk);
1603
                else
1604
                        err = update_bat(s, blk);
1605

    
1606
                if (err)
1607
                        return err;
1608

    
1609
                offset = s->bat.pbw_offset;
1610
        }
1611

    
1612
        offset += s->bm_secs + sec;
1613
        offset  = vhd_sectors_to_bytes(offset);
1614

    
1615
 make_request:
1616
        req = alloc_vhd_request(s);
1617
        if (!req)
1618
                return -EBUSY;
1619

    
1620
        req->treq  = treq;
1621
        req->flags = flags;
1622
        req->op    = VHD_OP_DATA_WRITE;
1623
        req->next  = NULL;
1624

    
1625
        if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
1626
                bm = get_bitmap(s, blk);
1627
                ASSERT(bm && bitmap_valid(bm));
1628
                lock_bitmap(bm);
1629

    
1630
                if (bm->tx.closed) {
1631
                        add_to_tail(&bm->queue, req);
1632
                        set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
1633
                } else
1634
                        add_to_transaction(&bm->tx, req);
1635
        } else if (sec == 0 &&         /* first sector inside data block */
1636
                   s->vhd.footer.type != HD_TYPE_FIXED && 
1637
                   bat_entry(s, blk) != s->first_db &&
1638
                   test_batmap(s, blk))
1639
                schedule_redundant_bm_write(s, blk);
1640

    
1641
        aio_write(s, req, offset);
1642

    
1643
        DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
1644
            "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
1645
            s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);
1646

    
1647
        return 0;
1648
}
1649

    
1650
static int 
1651
schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
1652
{
1653
        int err;
1654
        uint64_t offset;
1655
        struct vhd_bitmap  *bm;
1656
        struct vhd_request *req = NULL;
1657

    
1658
        ASSERT(vhd_type_dynamic(&s->vhd));
1659

    
1660
        offset = bat_entry(s, blk);
1661

    
1662
        ASSERT(offset != DD_BLK_UNUSED);
1663
        ASSERT(!get_bitmap(s, blk));
1664

    
1665
        offset = vhd_sectors_to_bytes(offset);
1666

    
1667
        err = alloc_vhd_bitmap(s, &bm, blk);
1668
        if (err)
1669
                return err;
1670

    
1671
        req = &bm->req;
1672
        init_vhd_request(s, req);
1673

    
1674
        req->treq.sec  = blk * s->spb;
1675
        req->treq.secs = s->bm_secs;
1676
        req->treq.buf  = bm->map;
1677
        req->treq.cb   = NULL;
1678
        req->op        = VHD_OP_BITMAP_READ;
1679
        req->next      = NULL;
1680

    
1681
        aio_read(s, req, offset);
1682
        lock_bitmap(bm);
1683
        install_bitmap(s, bm);
1684
        set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
1685

    
1686
        DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
1687
            "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
1688
            req->treq.secs, offset);
1689

    
1690
        return 0;
1691
}
1692

    
1693
static void
1694
schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
1695
{
1696
        uint64_t offset;
1697
        struct vhd_bitmap  *bm;
1698
        struct vhd_request *req;
1699

    
1700
        bm     = get_bitmap(s, blk);
1701
        offset = bat_entry(s, blk);
1702

    
1703
        ASSERT(vhd_type_dynamic(&s->vhd));
1704
        ASSERT(bm && bitmap_valid(bm) &&
1705
               !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
1706

    
1707
        if (offset == DD_BLK_UNUSED) {
1708
                ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
1709
                offset = s->bat.pbw_offset;
1710
        }
1711
        
1712
        offset = vhd_sectors_to_bytes(offset);
1713

    
1714
        req = &bm->req;
1715
        init_vhd_request(s, req);
1716

    
1717
        req->treq.sec  = blk * s->spb;
1718
        req->treq.secs = s->bm_secs;
1719
        req->treq.buf  = bm->shadow;
1720
        req->treq.cb   = NULL;
1721
        req->op        = VHD_OP_BITMAP_WRITE;
1722
        req->next      = NULL;
1723

    
1724
        aio_write(s, req, offset);
1725
        lock_bitmap(bm);
1726
        touch_bitmap(s, bm);     /* bump lru count */
1727
        set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
1728

    
1729
        DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
1730
            "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
1731
            req->treq.secs, offset);
1732
}
1733

    
1734
/* 
1735
 * queued requests will be submitted once the bitmap
1736
 * describing them is read and the requests are validated. 
1737
 */
1738
static int
1739
__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
1740
{
1741
        uint32_t blk;
1742
        struct vhd_bitmap  *bm;
1743
        struct vhd_request *req;
1744

    
1745
        ASSERT(vhd_type_dynamic(&s->vhd));
1746

    
1747
        blk = treq.sec / s->spb;
1748
        bm  = get_bitmap(s, blk);
1749

    
1750
        ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
1751

    
1752
        req = alloc_vhd_request(s);
1753
        if (!req)
1754
                return -EBUSY;
1755

    
1756
        req->treq = treq;
1757
        req->op   = op;
1758
        req->next = NULL;
1759

    
1760
        add_to_tail(&bm->waiting, req);
1761
        lock_bitmap(bm);
1762

    
1763
        DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
1764
            "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);
1765

    
1766
        TRACE(s);
1767
        return 0;
1768
}
1769

    
1770
static void
1771
vhd_queue_read(td_driver_t *driver, td_request_t treq)
1772
{
1773
        struct vhd_state *s = (struct vhd_state *)driver->data;
1774

    
1775
        DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
1776
            s->vhd.file, treq.sec, treq.secs, treq.sidx);
1777

    
1778
        while (treq.secs) {
1779
                int err;
1780
                td_request_t clone;
1781

    
1782
                err   = 0;
1783
                clone = treq;
1784

    
1785
                switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
1786
                case -EINVAL:
1787
                        err = -EINVAL;
1788
                        goto fail;
1789

    
1790
                case VHD_BM_BAT_CLEAR:
1791
                        clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1792
                        td_forward_request(clone);
1793
                        break;
1794

    
1795
                case VHD_BM_BIT_CLEAR:
1796
                        clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
1797
                        td_forward_request(clone);
1798
                        break;
1799

    
1800
                case VHD_BM_BIT_SET:
1801
                        clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
1802
                        err = schedule_data_read(s, clone, 0);
1803
                        if (err)
1804
                                goto fail;
1805
                        break;
1806

    
1807
                case VHD_BM_NOT_CACHED:
1808
                        err = schedule_bitmap_read(s, clone.sec / s->spb);
1809
                        if (err)
1810
                                goto fail;
1811

    
1812
                        clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1813
                        err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
1814
                        if (err)
1815
                                goto fail;
1816
                        break;
1817

    
1818
                case VHD_BM_READ_PENDING:
1819
                        clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1820
                        err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
1821
                        if (err)
1822
                                goto fail;
1823
                        break;
1824

    
1825
                case VHD_BM_BAT_LOCKED:
1826
                default:
1827
                        ASSERT(0);
1828
                        break;
1829
                }
1830

    
1831
                treq.sec  += clone.secs;
1832
                treq.secs -= clone.secs;
1833
                treq.buf  += vhd_sectors_to_bytes(clone.secs);
1834
                continue;
1835

    
1836
        fail:
1837
                clone.secs = treq.secs;
1838
                td_complete_request(clone, err);
1839
                break;
1840
        }
1841
}
1842

    
1843
static void
1844
vhd_queue_write(td_driver_t *driver, td_request_t treq)
1845
{
1846
        struct vhd_state *s = (struct vhd_state *)driver->data;
1847

    
1848
        DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
1849
            s->vhd.file, treq.sec, treq.secs, treq.sidx);
1850

    
1851
        while (treq.secs) {
1852
                int err;
1853
                uint8_t flags;
1854
                td_request_t clone;
1855

    
1856
                err   = 0;
1857
                flags = 0;
1858
                clone = treq;
1859

    
1860
                switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
1861
                case -EINVAL:
1862
                        err = -EINVAL;
1863
                        goto fail;
1864

    
1865
                case VHD_BM_BAT_LOCKED:
1866
                        err = -EBUSY;
1867
                        goto fail;
1868

    
1869
                case VHD_BM_BAT_CLEAR:
1870
                        flags      = (VHD_FLAG_REQ_UPDATE_BAT |
1871
                                      VHD_FLAG_REQ_UPDATE_BITMAP);
1872
                        clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1873
                        err        = schedule_data_write(s, clone, flags);
1874
                        if (err)
1875
                                goto fail;
1876
                        break;
1877

    
1878
                case VHD_BM_BIT_CLEAR:
1879
                        flags      = VHD_FLAG_REQ_UPDATE_BITMAP;
1880
                        clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
1881
                        err        = schedule_data_write(s, clone, flags);
1882
                        if (err)
1883
                                goto fail;
1884
                        break;
1885

    
1886
                case VHD_BM_BIT_SET:
1887
                        clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
1888
                        err = schedule_data_write(s, clone, 0);
1889
                        if (err)
1890
                                goto fail;
1891
                        break;
1892

    
1893
                case VHD_BM_NOT_CACHED:
1894
                        clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1895
                        err = schedule_bitmap_read(s, clone.sec / s->spb);
1896
                        if (err)
1897
                                goto fail;
1898

    
1899
                        err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
1900
                        if (err)
1901
                                goto fail;
1902
                        break;
1903

    
1904
                case VHD_BM_READ_PENDING:
1905
                        clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1906
                        err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
1907
                        if (err)
1908
                                goto fail;
1909
                        break;
1910

    
1911
                default:
1912
                        ASSERT(0);
1913
                        break;
1914
                }
1915

    
1916
                treq.sec  += clone.secs;
1917
                treq.secs -= clone.secs;
1918
                treq.buf  += vhd_sectors_to_bytes(clone.secs);
1919
                continue;
1920

    
1921
        fail:
1922
                clone.secs = treq.secs;
1923
                td_complete_request(clone, err);
1924
                break;
1925
        }
1926
}
1927

    
1928
static inline void
1929
signal_completion(struct vhd_request *list, int error)
1930
{
1931
        struct vhd_state *s;
1932
        struct vhd_request *r, *next;
1933

    
1934
        if (!list)
1935
                return;
1936

    
1937
        r = list;
1938
        s = list->state;
1939

    
1940
        while (r) {
1941
                int err;
1942

    
1943
                err  = (error ? error : r->error);
1944
                next = r->next;
1945
                td_complete_request(r->treq, err);
1946
                DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
1947
                    "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err);
1948
                free_vhd_request(s, r);
1949
                r    = next;
1950

    
1951
                s->returned++;
1952
                TRACE(s);
1953
        }
1954
}
1955

    
1956
static void
1957
start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
1958
{
1959
        struct vhd_transaction *tx;
1960
        struct vhd_request *r, *next;
1961
        int i;
1962

    
1963
        if (!bm->queue.head)
1964
                return;
1965

    
1966
        DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
1967

    
1968
        r  = bm->queue.head;
1969
        tx = &bm->tx;
1970
        clear_req_list(&bm->queue);
1971

    
1972
        if (r && bat_entry(s, bm->blk) == DD_BLK_UNUSED)
1973
                tx->error = -EIO;
1974

    
1975
        while (r) {
1976
                next    = r->next;
1977
                r->next = NULL;
1978
                clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);
1979

    
1980
                add_to_transaction(tx, r);
1981
                if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) {
1982
                        tx->finished++;
1983
                        if (!r->error) {
1984
                                uint32_t sec = r->treq.sec % s->spb;
1985
                                for (i = 0; i < r->treq.secs; i++)
1986
                                        vhd_bitmap_set(&s->vhd,
1987
                                                       bm->shadow, sec + i);
1988
                        }
1989
                }
1990
                r = next;
1991
        }
1992

    
1993
        /* perhaps all the queued writes already completed? */
1994
        if (tx->started && transaction_completed(tx))
1995
                finish_data_transaction(s, bm);
1996
}
1997

    
1998
static void
1999
finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
2000
{
2001
        struct vhd_transaction *tx = &bm->tx;
2002

    
2003
        if (!bat_locked(s))
2004
                return;
2005

    
2006
        if (s->bat.pbw_blk != bm->blk)
2007
                return;
2008

    
2009
        if (!s->bat.req.error)
2010
                goto release;
2011

    
2012
        if (!test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE))
2013
                goto release;
2014

    
2015
        tx->closed = 1;
2016
        return;
2017

    
2018
 release:
2019
        DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
2020
        unlock_bat(s);
2021
        init_bat(s);
2022
}
2023

    
2024
static void
2025
finish_bitmap_transaction(struct vhd_state *s,
2026
                          struct vhd_bitmap *bm, int error)
2027
{
2028
        int map_size;
2029
        struct vhd_transaction *tx = &bm->tx;
2030

    
2031
        DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
2032
        tx->error = (tx->error ? tx->error : error);
2033
        map_size  = vhd_sectors_to_bytes(s->bm_secs);
2034

    
2035
        if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
2036
                if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
2037
                        /* still waiting for bat write */
2038
                        ASSERT(bm->blk == s->bat.pbw_blk);
2039
                        ASSERT(test_vhd_flag(s->bat.status, 
2040
                                             VHD_FLAG_BAT_WRITE_STARTED));
2041
                        s->bat.req.tx = tx;
2042
                        return;
2043
                }
2044
        }
2045

    
2046
        if (tx->error) {
2047
                /* undo changes to shadow */
2048
                memcpy(bm->shadow, bm->map, map_size);
2049
        } else {
2050
                /* complete atomic write */
2051
                memcpy(bm->map, bm->shadow, map_size);
2052
                if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
2053
                        set_batmap(s, bm->blk);
2054
        }
2055

    
2056
        /* transaction done; signal completions */
2057
        signal_completion(tx->requests.head, tx->error);
2058
        init_tx(tx);
2059
        start_new_bitmap_transaction(s, bm);
2060

    
2061
        if (!bitmap_in_use(bm))
2062
                unlock_bitmap(bm);
2063

    
2064
        finish_bat_transaction(s, bm);
2065
}
2066

    
2067
static void
2068
finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
2069
{
2070
        struct vhd_transaction *tx = &bm->tx;
2071

    
2072
        DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
2073

    
2074
        tx->closed = 1;
2075

    
2076
        if (!tx->error)
2077
                return schedule_bitmap_write(s, bm->blk);
2078

    
2079
        return finish_bitmap_transaction(s, bm, 0);
2080
}
2081

    
2082
static void
2083
finish_bat_write(struct vhd_request *req)
2084
{
2085
        struct vhd_bitmap *bm;
2086
        struct vhd_transaction *tx;
2087
        struct vhd_state *s = req->state;
2088

    
2089
        s->returned++;
2090
        TRACE(s);
2091

    
2092
        bm = get_bitmap(s, s->bat.pbw_blk);
2093

    
2094
        DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
2095
            s->bat.pbw_blk, s->bat.pbw_offset, req->error);
2096
        ASSERT(bm && bitmap_valid(bm));
2097
        ASSERT(bat_locked(s) &&
2098
               test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
2099

    
2100
        tx = &bm->tx;
2101
        ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));
2102

    
2103
        if (!req->error) {
2104
                bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
2105
                s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
2106
        } else
2107
                tx->error = req->error;
2108

    
2109
        if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
2110
                tx->finished++;
2111
                remove_from_req_list(&tx->requests, req);
2112
                if (transaction_completed(tx))
2113
                        finish_data_transaction(s, bm);
2114
        } else {
2115
                clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
2116
                if (s->bat.req.tx)
2117
                        finish_bitmap_transaction(s, bm, req->error);
2118
        }
2119

    
2120
        finish_bat_transaction(s, bm);
2121
}
2122

    
2123
static void
2124
finish_zero_bm_write(struct vhd_request *req)
2125
{
2126
        uint32_t blk;
2127
        struct vhd_bitmap *bm;
2128
        struct vhd_transaction *tx = req->tx;
2129
        struct vhd_state *s = req->state;
2130

    
2131
        s->returned++;
2132
        TRACE(s);
2133

    
2134
        blk = req->treq.sec / s->spb;
2135
        bm  = get_bitmap(s, blk);
2136

    
2137
        DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
2138
        ASSERT(bat_locked(s));
2139
        ASSERT(s->bat.pbw_blk == blk);
2140
        ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
2141

    
2142
        tx->finished++;
2143
        remove_from_req_list(&tx->requests, req);
2144

    
2145
        if (req->error) {
2146
                unlock_bat(s);
2147
                init_bat(s);
2148
                tx->error = req->error;
2149
                clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
2150
        } else
2151
                schedule_bat_write(s);
2152

    
2153
        if (transaction_completed(tx))
2154
                finish_data_transaction(s, bm);
2155
}
2156

    
2157
static int
2158
finish_redundant_bm_write(struct vhd_request *req)
2159
{
2160
        /* uint32_t blk; */
2161
        struct vhd_state *s = (struct vhd_state *) req->state;
2162

    
2163
        s->returned++;
2164
        TRACE(s);        
2165
        /* blk = req->treq.sec / s->spb;
2166
           DBG(TLOG_DBG, "blk: %u\n", blk); */
2167

    
2168
        if (req->error) {
2169
                ERR(s, req->error, "lsec: 0x%08"PRIx64, req->treq.sec);
2170
        }
2171
        free_vhd_request(s, req);
2172
        s->debug_done_redundant_writes++;
2173
        return 0;
2174
}
2175

    
2176

    
2177
static void
2178
finish_bitmap_read(struct vhd_request *req)
2179
{
2180
        uint32_t blk;
2181
        struct vhd_bitmap  *bm;
2182
        struct vhd_request *r, *next;
2183
        struct vhd_state   *s = req->state;
2184

    
2185
        s->returned++;
2186
        TRACE(s);
2187

    
2188
        blk = req->treq.sec / s->spb;
2189
        bm  = get_bitmap(s, blk);
2190

    
2191
        DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
2192
        ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
2193

    
2194
        r = bm->waiting.head;
2195
        clear_req_list(&bm->waiting);
2196
        clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
2197

    
2198
        if (!req->error) {
2199
                memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs));
2200

    
2201
                while (r) {
2202
                        struct vhd_request tmp;
2203

    
2204
                        tmp  = *r;
2205
                        next =  r->next;
2206
                        free_vhd_request(s, r);
2207

    
2208
                        ASSERT(tmp.op == VHD_OP_DATA_READ || 
2209
                               tmp.op == VHD_OP_DATA_WRITE);
2210

    
2211
                        if (tmp.op == VHD_OP_DATA_READ)
2212
                                vhd_queue_read(s->driver, tmp.treq);
2213
                        else if (tmp.op == VHD_OP_DATA_WRITE)
2214
                                vhd_queue_write(s->driver, tmp.treq);
2215

    
2216
                        r = next;
2217
                }
2218
        } else {
2219
                int err = req->error;
2220
                unlock_bitmap(bm);
2221
                free_vhd_bitmap(s, bm);
2222
                return signal_completion(r, err);
2223
        }
2224

    
2225
        if (!bitmap_in_use(bm))
2226
                unlock_bitmap(bm);
2227
}
2228

    
2229
static void
2230
finish_bitmap_write(struct vhd_request *req)
2231
{
2232
        uint32_t blk;
2233
        struct vhd_bitmap  *bm;
2234
        struct vhd_transaction *tx;
2235
        struct vhd_state *s = req->state;
2236

    
2237
        s->returned++;
2238
        TRACE(s);
2239

    
2240
        blk = req->treq.sec / s->spb;
2241
        bm  = get_bitmap(s, blk);
2242
        tx  = &bm->tx;
2243

    
2244
        DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n",
2245
            blk, tx->started, tx->finished);
2246
        ASSERT(tx->closed);
2247
        ASSERT(bm && bitmap_valid(bm));
2248
        ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
2249

    
2250
        clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
2251

    
2252
        finish_bitmap_transaction(s, bm, req->error);
2253
}
2254

    
2255
static void
2256
finish_data_read(struct vhd_request *req)
2257
{
2258
        struct vhd_state *s = req->state;
2259

    
2260
        DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", 
2261
            req->treq.sec, req->treq.sec / s->spb);
2262
        signal_completion(req, 0);
2263
}
2264

    
2265
static void
2266
finish_data_write(struct vhd_request *req)
2267
{
2268
        int i;
2269
        struct vhd_transaction *tx = req->tx;
2270
        struct vhd_state *s = (struct vhd_state *)req->state;
2271

    
2272
        set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED);
2273

    
2274
        if (tx) {
2275
                uint32_t blk, sec;
2276
                struct vhd_bitmap *bm;
2277

    
2278
                blk = req->treq.sec / s->spb;
2279
                sec = req->treq.sec % s->spb;
2280
                bm  = get_bitmap(s, blk);
2281

    
2282
                ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
2283

    
2284
                tx->finished++;
2285

    
2286
                DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x04%"PRIx64", "
2287
                    "tx->started: %d, tx->finished: %d\n", req->treq.sec,
2288
                    req->treq.sec / s->spb, tx->started, tx->finished);
2289

    
2290
                if (!req->error)
2291
                        for (i = 0; i < req->treq.secs; i++)
2292
                                vhd_bitmap_set(&s->vhd, bm->shadow,  sec + i);
2293

    
2294
                if (transaction_completed(tx))
2295
                        finish_data_transaction(s, bm);
2296

    
2297
        } else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) {
2298
                ASSERT(!req->next);
2299
                DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", 
2300
                    req->treq.sec, req->treq.sec / s->spb);
2301
                signal_completion(req, 0);
2302
        }
2303
}
2304

    
2305
void
2306
vhd_complete(void *arg, struct tiocb *tiocb, int err)
2307
{
2308
        struct vhd_request *req = (struct vhd_request *)arg;
2309
        struct vhd_state *s = req->state;
2310
        struct iocb *io = &tiocb->iocb;
2311

    
2312
        s->completed++;
2313
        TRACE(s);
2314

    
2315
        req->error = err;
2316

    
2317
        if (req->error)
2318
                ERR(s, req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, "
2319
                    "nbytes: %lu, blk: %"PRIu64", blk_offset: %u",
2320
                    s->vhd.file, req->op, req->treq.sec, req->treq.secs,
2321
                    io->u.c.nbytes, req->treq.sec / s->spb,
2322
                    bat_entry(s, req->treq.sec / s->spb));
2323

    
2324
        switch (req->op) {
2325
        case VHD_OP_DATA_READ:
2326
                finish_data_read(req);
2327
                break;
2328

    
2329
        case VHD_OP_DATA_WRITE:
2330
                finish_data_write(req);
2331
                break;
2332

    
2333
        case VHD_OP_BITMAP_READ:
2334
                finish_bitmap_read(req);
2335
                break;
2336

    
2337
        case VHD_OP_BITMAP_WRITE:
2338
                finish_bitmap_write(req);
2339
                break;
2340

    
2341
        case VHD_OP_ZERO_BM_WRITE:
2342
                finish_zero_bm_write(req);
2343
                break;
2344

    
2345
        case VHD_OP_REDUNDANT_BM_WRITE:
2346
                finish_redundant_bm_write(req);
2347
                break;
2348

    
2349
        case VHD_OP_BAT_WRITE:
2350
                finish_bat_write(req);
2351
                break;
2352

    
2353
        default:
2354
                ASSERT(0);
2355
                break;
2356
        }
2357
}
2358

    
2359
void 
2360
vhd_debug(td_driver_t *driver)
2361
{
2362
        int i;
2363
        struct vhd_state *s = (struct vhd_state *)driver->data;
2364

    
2365
        DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", "
2366
            "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed,
2367
            s->returned);
2368
        DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n",
2369
            s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0));
2370
        DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n",
2371
            s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0));
2372

    
2373
        DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%u total)\n", VHD_REQS_DATA);
2374
        for (i = 0; i < VHD_REQS_DATA; i++) {
2375
                struct vhd_request *r = &s->vreq_list[i];
2376
                td_request_t *t       = &r->treq;
2377
                const char *vname     = t->vreq ? t->vreq->name: NULL;
2378
                if (t->secs)
2379
                        DBG(TLOG_WARN, "%d: vreq: %s.%d, err: %d, op: %d,"
2380
                            " lsec: 0x%08"PRIx64", flags: %d, this: %p, "
2381
                            "next: %p, tx: %p\n", i, vname, t->sidx, r->error, r->op,
2382
                            t->sec, r->flags, r, r->next, r->tx);
2383
        }
2384

    
2385
        DBG(TLOG_WARN, "BITMAP CACHE:\n");
2386
        for (i = 0; i < VHD_CACHE_SIZE; i++) {
2387
                int qnum = 0, wnum = 0, rnum = 0;
2388
                struct vhd_bitmap *bm = s->bitmap[i];
2389
                struct vhd_transaction *tx;
2390
                struct vhd_request *r;
2391

    
2392
                if (!bm)
2393
                        continue;
2394

    
2395
                tx = &bm->tx;
2396
                r = bm->queue.head;
2397
                while (r) {
2398
                        qnum++;
2399
                        r = r->next;
2400
                }
2401

    
2402
                r = bm->waiting.head;
2403
                while (r) {
2404
                        wnum++;
2405
                        r = r->next;
2406
                }
2407

    
2408
                r = tx->requests.head;
2409
                while (r) {
2410
                        rnum++;
2411
                        r = r->next;
2412
                }
2413

    
2414
                DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, "
2415
                    "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, "
2416
                    "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n",
2417
                    i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head,
2418
                    wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error,
2419
                    tx->started, tx->finished, tx->status, tx->requests.head, rnum);
2420
        }
2421

    
2422
        DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, "
2423
            "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk,
2424
            s->bat.pbw_offset, s->bat.req.tx);
2425

    
2426
/*
2427
        for (i = 0; i < s->hdr.max_bat_size; i++)
2428
                DPRINTF("%d: %u\n", i, s->bat.bat[i]);
2429
*/
2430
}
2431

    
2432
struct tap_disk tapdisk_vhd = {
2433
        .disk_type          = "tapdisk_vhd",
2434
        .flags              = 0,
2435
        .private_data_size  = sizeof(struct vhd_state),
2436
        .td_open            = _vhd_open,
2437
        .td_close           = _vhd_close,
2438
        .td_queue_read      = vhd_queue_read,
2439
        .td_queue_write     = vhd_queue_write,
2440
        .td_get_parent_id   = vhd_get_parent_id,
2441
        .td_validate_parent = vhd_validate_parent,
2442
        .td_debug           = vhd_debug,
2443
};